LCOV - llvm-toolchain.info - lib/Target/AMDGPU/AMDGPUISelLowering.cpp

LCOV - code coverage report

Current view:	top level - lib/Target/AMDGPU - AMDGPUISelLowering.cpp (source / functions)		Hit	Total	Coverage
Test:	llvm-toolchain.info	Lines:	1540	1831	84.1 %
Date:	2018-10-20 13:21:21	Functions:	109	120	90.8 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// This is the parent TargetLowering class for hardware code gen
      12             : /// targets.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f
      17             : #define AMDGPU_LN2_F       0.693147180559945309417232121458176568f
      18             : #define AMDGPU_LN10_F      2.30258509299404568401799145468436421f
      19             : 
      20             : #include "AMDGPUISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUCallLowering.h"
      23             : #include "AMDGPUFrameLowering.h"
      24             : #include "AMDGPUIntrinsicInfo.h"
      25             : #include "AMDGPURegisterInfo.h"
      26             : #include "AMDGPUSubtarget.h"
      27             : #include "AMDGPUTargetMachine.h"
      28             : #include "Utils/AMDGPUBaseInfo.h"
      29             : #include "R600MachineFunctionInfo.h"
      30             : #include "SIInstrInfo.h"
      31             : #include "SIMachineFunctionInfo.h"
      32             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      33             : #include "llvm/CodeGen/Analysis.h"
      34             : #include "llvm/CodeGen/CallingConvLower.h"
      35             : #include "llvm/CodeGen/MachineFunction.h"
      36             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      37             : #include "llvm/CodeGen/SelectionDAG.h"
      38             : #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
      39             : #include "llvm/IR/DataLayout.h"
      40             : #include "llvm/IR/DiagnosticInfo.h"
      41             : #include "llvm/Support/KnownBits.h"
      42             : using namespace llvm;
      43             : 
       44           0 : static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
       45             :                            CCValAssign::LocInfo LocInfo,
       46             :                            ISD::ArgFlagsTy ArgFlags, CCState &State,
       47             :                            const TargetRegisterClass *RC,
       48             :                            unsigned NumRegs) {
                          // Tries to assign value ValNo to a single register drawn from the first
                          // NumRegs entries of register class RC.
                          //
                          // Returns true after recording a register location for the value in
                          // State; returns false (before any location is added) when
                          // State.AllocateReg yields AMDGPU::NoRegister.
                          //
                          // ArgFlags is not read in this body.
                          //
                          // Restrict the candidates to the leading NumRegs registers of RC.
       49           0 :   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
       50           0 :   unsigned RegResult = State.AllocateReg(RegList);
                          // NoRegister is treated as "nothing could be allocated".
       51           0 :   if (RegResult == AMDGPU::NoRegister)
       52           0 :     return false;
       53             : 
                          // Record the register assignment for this value.
       54           0 :   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
       55           0 :   return true;
       56             : }
      57             : 
       58           0 : static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
       59             :                               CCValAssign::LocInfo LocInfo,
       60             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
                          // Assigns a value whose location type is one of the listed 64-bit types
                          // (i64, f64, v2i32, v2f32, v4i16, v4f16) to an entry of
                          // AMDGPU::SGPR_64RegClass by delegating to allocateCCRegs.
                          //
                          // Returns the result of allocateCCRegs for those types, and false for
                          // every other location type (no allocation is attempted).
       61           0 :   switch (LocVT.SimpleTy) {
       62           0 :   case MVT::i64:
       63             :   case MVT::f64:
       64             :   case MVT::v2i32:
       65             :   case MVT::v2f32:
       66             :   case MVT::v4i16:
       67             :   case MVT::v4f16: {
       68             :     // Up to SGPR0-SGPR39
                            // Only the first 20 entries of the 64-bit SGPR class are candidates.
       69           0 :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
       70           0 :                           &AMDGPU::SGPR_64RegClass, 20);
       71             :   }
       72             :   default:
                            // Any other location type is not handled by this helper.
       73             :     return false;
       74             :   }
       75             : }
      76             : 
       77             : // Allocate up to VGPR31.
       78             : //
       79             : // TODO: Since there are no VGPR alignment requirements would it be better to
       80             : // split into individual scalar registers?
                        //
                        // Picks a VGPR tuple register class from the location type and delegates
                        // to allocateCCRegs with a per-class candidate limit:
                        //   i64/f64/v2i32/v2f32/v4i16/v4f16 -> VReg_64RegClass,  first 31 entries
                        //   v4i32/v4f32/v2i64/v2f64         -> VReg_128RegClass, first 29 entries
                        //   v8i32/v8f32                     -> VReg_256RegClass, first 25 entries
                        //   v16i32/v16f32                   -> VReg_512RegClass, first 17 entries
                        // NOTE(review): the limits presumably make the last candidate tuple end
                        // at VGPR31, matching the comment above -- confirm against the register
                        // class definitions.
                        //
                        // Returns the result of allocateCCRegs for the types above, and false for
                        // any other location type.
       81           0 : static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
       82             :                               CCValAssign::LocInfo LocInfo,
       83             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
       84           0 :   switch (LocVT.SimpleTy) {
       85           0 :   case MVT::i64:
       86             :   case MVT::f64:
       87             :   case MVT::v2i32:
       88             :   case MVT::v2f32:
       89             :   case MVT::v4i16:
       90             :   case MVT::v4f16: {
       91           0 :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
       92           0 :                           &AMDGPU::VReg_64RegClass, 31);
       93             :   }
       94           0 :   case MVT::v4i32:
       95             :   case MVT::v4f32:
       96             :   case MVT::v2i64:
       97             :   case MVT::v2f64: {
       98           0 :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
       99           0 :                           &AMDGPU::VReg_128RegClass, 29);
      100             :   }
      101           0 :   case MVT::v8i32:
      102             :   case MVT::v8f32: {
      103           0 :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
      104           0 :                           &AMDGPU::VReg_256RegClass, 25);
      105             : 
      106             :   }
      107           0 :   case MVT::v16i32:
      108             :   case MVT::v16f32: {
      109           0 :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
      110           0 :                           &AMDGPU::VReg_512RegClass, 17);
      111             : 
      112             :   }
      113             :   default:
                            // Location types not listed above are not handled by this helper.
      114             :     return false;
      115             :   }
      116             : }
     117             : 
     118             : #include "AMDGPUGenCallingConv.inc"
     119             : 
      120             : // Find a larger type to do a load / store of a vector with.
                        //
                        // The result depends only on VT's store size in bits:
                        //   - 32 bits or fewer: an integer type of exactly that many bits;
                        //   - otherwise: a vector of i32 with StoreSize / 32 elements.
                        // Store sizes above 32 bits are asserted to be a multiple of 32.
      121        7341 : EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
      122             :   unsigned StoreSize = VT.getStoreSizeInBits();
                          // Small values map to a single integer of the same store size.
      123        7341 :   if (StoreSize <= 32)
      124        1738 :     return EVT::getIntegerVT(Ctx, StoreSize);
      125             : 
                          // Wider values are expressed as a whole number of 32-bit lanes.
      126             :   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
      127       11206 :   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
      128             : }
     129             : 
      130       12389 : unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
                          // Returns the bit width of Op's value type minus the minimum number of
                          // leading zero bits reported by the DAG's known-bits analysis, i.e. an
                          // upper bound on how many low bits of Op may be non-zero.
      131       12389 :   KnownBits Known;
      132       12389 :   EVT VT = Op.getValueType();
                          // Populate Known with what the DAG can prove about Op's bits.
      133       12389 :   DAG.computeKnownBits(Op, Known);
      134             : 
      135       24778 :   return VT.getSizeInBits() - Known.countMinLeadingZeros();
      136             : }
     137             : 
      138        6168 : unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
                          // Returns the bit width of Op's value type minus the number of sign bits
                          // that DAG.ComputeNumSignBits reports for Op.
                          // NOTE(review): if ComputeNumSignBits counts the sign bit itself, the
                          // result excludes the sign bit -- confirm how callers compare it (the
                          // comment below refers to a signed 24-bit check).
      139        6168 :   EVT VT = Op.getValueType();
      140             : 
      141             :   // In order for this to be a signed 24-bit value, bit 23, must
      142             :   // be a sign bit.
      143        6168 :   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
      144             : }
     145             : 
     146        2783 : AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     147        2783 :                                            const AMDGPUSubtarget &STI)
     148        2783 :     : TargetLowering(TM), Subtarget(&STI) {
     149             :   // Lower floating point store/load to integer store/load to reduce the number
     150             :   // of patterns in tablegen.
     151             :   setOperationAction(ISD::LOAD, MVT::f32, Promote);
     152             :   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
     153             : 
     154             :   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
     155             :   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
     156             : 
     157             :   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
     158             :   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
     159             : 
     160             :   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
     161             :   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
     162             : 
     163             :   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
     164             :   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
     165             : 
     166             :   setOperationAction(ISD::LOAD, MVT::i64, Promote);
     167             :   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
     168             : 
     169             :   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
     170             :   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
     171             : 
     172             :   setOperationAction(ISD::LOAD, MVT::f64, Promote);
     173             :   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
     174             : 
     175             :   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
     176             :   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
     177             : 
     178             :   // There are no 64-bit extloads. These should be done as a 32-bit extload and
     179             :   // an extension to 64-bit.
     180       19481 :   for (MVT VT : MVT::integer_valuetypes()) {
     181             :     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
     182             :     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
     183             :     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
     184             :   }
     185             : 
     186       19481 :   for (MVT VT : MVT::integer_valuetypes()) {
     187       16698 :     if (VT == MVT::i64)
     188             :       continue;
     189             : 
     190             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     191             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
     192             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
     193             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
     194             : 
     195             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     196             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
     197             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
     198             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
     199             : 
     200             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     201             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
     202             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
     203             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
     204             :   }
     205             : 
     206      200376 :   for (MVT VT : MVT::integer_vector_valuetypes()) {
     207             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
     208             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
     209             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
     210             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
     211             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
     212             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
     213             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
     214             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
     215             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
     216             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
     217             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
     218             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
     219             :   }
     220             : 
     221        2783 :   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
     222             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
     223             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
     224             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
     225             : 
     226             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
     227             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
     228             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
     229             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
     230             : 
     231             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     232             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
     233             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
     234             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
     235             : 
     236             :   setOperationAction(ISD::STORE, MVT::f32, Promote);
     237             :   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
     238             : 
     239             :   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
     240             :   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
     241             : 
     242             :   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
     243             :   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
     244             : 
     245             :   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
     246             :   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
     247             : 
     248             :   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
     249             :   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
     250             : 
     251             :   setOperationAction(ISD::STORE, MVT::i64, Promote);
     252             :   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
     253             : 
     254             :   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
     255             :   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
     256             : 
     257             :   setOperationAction(ISD::STORE, MVT::f64, Promote);
     258             :   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
     259             : 
     260             :   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
     261             :   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
     262             : 
     263             :   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
     264             :   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
     265             :   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     266             :   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
     267             : 
     268             :   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
     269             :   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
     270             :   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
     271             :   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
     272             : 
     273             :   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
     274             :   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
     275             :   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
     276             :   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
     277             : 
     278             :   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
     279             :   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     280             : 
     281             :   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
     282             :   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
     283             : 
     284             :   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
     285             :   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
     286             : 
     287             :   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
     288             :   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
     289             : 
     290             : 
     291             :   setOperationAction(ISD::Constant, MVT::i32, Legal);
     292             :   setOperationAction(ISD::Constant, MVT::i64, Legal);
     293             :   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
     294             :   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
     295             : 
     296             :   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
     297             :   setOperationAction(ISD::BRIND, MVT::Other, Expand);
     298             : 
     299             :   // This is totally unsupported, just custom lower to produce an error.
     300             :   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
     301             : 
     302             :   // Library functions.  These default to Expand, but we have instructions
     303             :   // for them.
     304             :   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
     305             :   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
     306             :   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
     307             :   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
     308             :   setOperationAction(ISD::FABS,   MVT::f32, Legal);
     309             :   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     310             :   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
     311             :   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
     312             :   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     313             :   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
     314             : 
     315             :   setOperationAction(ISD::FROUND, MVT::f32, Custom);
     316             :   setOperationAction(ISD::FROUND, MVT::f64, Custom);
     317             : 
     318             :   setOperationAction(ISD::FLOG, MVT::f32, Custom);
     319             :   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
     320             :   setOperationAction(ISD::FEXP, MVT::f32, Custom);
     321             : 
     322             : 
     323             :   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
     324             :   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
     325             : 
     326             :   setOperationAction(ISD::FREM, MVT::f32, Custom);
     327             :   setOperationAction(ISD::FREM, MVT::f64, Custom);
     328             : 
     329             :   // Expand to fneg + fadd.
     330             :   setOperationAction(ISD::FSUB, MVT::f64, Expand);
     331             : 
     332             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
     333             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
     334             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
     335             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
     336             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
     337             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
     338             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
     339             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
     340             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
     341             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
     342             : 
     343             :   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     344             :   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
     345             :   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
     346             : 
     347        2783 :   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
     348        8349 :   for (MVT VT : ScalarIntVTs) {
     349             :     // These should use [SU]DIVREM, so set them to expand
     350             :     setOperationAction(ISD::SDIV, VT, Expand);
     351             :     setOperationAction(ISD::UDIV, VT, Expand);
     352             :     setOperationAction(ISD::SREM, VT, Expand);
     353             :     setOperationAction(ISD::UREM, VT, Expand);
     354             : 
     355             :     // GPU does not have divrem function for signed or unsigned.
     356             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     357             :     setOperationAction(ISD::UDIVREM, VT, Custom);
     358             : 
     359             :     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
     360             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     361             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     362             : 
     363             :     setOperationAction(ISD::BSWAP, VT, Expand);
     364             :     setOperationAction(ISD::CTTZ, VT, Expand);
     365             :     setOperationAction(ISD::CTLZ, VT, Expand);
     366             : 
     367             :     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
     368             :     setOperationAction(ISD::ADDC, VT, Legal);
     369             :     setOperationAction(ISD::SUBC, VT, Legal);
     370             :     setOperationAction(ISD::ADDE, VT, Legal);
     371             :     setOperationAction(ISD::SUBE, VT, Legal);
     372             :   }
     373             : 
     374             :   // The hardware supports 32-bit ROTR, but not ROTL.
     375             :   setOperationAction(ISD::ROTL, MVT::i32, Expand);
     376             :   setOperationAction(ISD::ROTL, MVT::i64, Expand);
     377             :   setOperationAction(ISD::ROTR, MVT::i64, Expand);
     378             : 
     379             :   setOperationAction(ISD::MUL, MVT::i64, Expand);
     380             :   setOperationAction(ISD::MULHU, MVT::i64, Expand);
     381             :   setOperationAction(ISD::MULHS, MVT::i64, Expand);
     382             :   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
     383             :   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
     384             :   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     385             :   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     386             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     387             : 
     388             :   setOperationAction(ISD::SMIN, MVT::i32, Legal);
     389             :   setOperationAction(ISD::UMIN, MVT::i32, Legal);
     390             :   setOperationAction(ISD::SMAX, MVT::i32, Legal);
     391             :   setOperationAction(ISD::UMAX, MVT::i32, Legal);
     392             : 
     393             :   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
     394             :   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
     395             :   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
     396             :   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
     397             : 
     398             :   static const MVT::SimpleValueType VectorIntTypes[] = {
     399             :     MVT::v2i32, MVT::v4i32
     400             :   };
     401             : 
     402        8349 :   for (MVT VT : VectorIntTypes) {
     403             :     // Expand the following operations for the current type by default.
     404             :     setOperationAction(ISD::ADD,  VT, Expand);
     405             :     setOperationAction(ISD::AND,  VT, Expand);
     406             :     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
     407             :     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
     408             :     setOperationAction(ISD::MUL,  VT, Expand);
     409             :     setOperationAction(ISD::MULHU, VT, Expand);
     410             :     setOperationAction(ISD::MULHS, VT, Expand);
     411             :     setOperationAction(ISD::OR,   VT, Expand);
     412             :     setOperationAction(ISD::SHL,  VT, Expand);
     413             :     setOperationAction(ISD::SRA,  VT, Expand);
     414             :     setOperationAction(ISD::SRL,  VT, Expand);
     415             :     setOperationAction(ISD::ROTL, VT, Expand);
     416             :     setOperationAction(ISD::ROTR, VT, Expand);
     417             :     setOperationAction(ISD::SUB,  VT, Expand);
     418             :     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
     419             :     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     420             :     setOperationAction(ISD::SDIV, VT, Expand);
     421             :     setOperationAction(ISD::UDIV, VT, Expand);
     422             :     setOperationAction(ISD::SREM, VT, Expand);
     423             :     setOperationAction(ISD::UREM, VT, Expand);
     424             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     425             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     426             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     427             :     setOperationAction(ISD::UDIVREM, VT, Expand);
     428             :     setOperationAction(ISD::SELECT, VT, Expand);
     429             :     setOperationAction(ISD::VSELECT, VT, Expand);
     430             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     431             :     setOperationAction(ISD::XOR,  VT, Expand);
     432             :     setOperationAction(ISD::BSWAP, VT, Expand);
     433             :     setOperationAction(ISD::CTPOP, VT, Expand);
     434             :     setOperationAction(ISD::CTTZ, VT, Expand);
     435             :     setOperationAction(ISD::CTLZ, VT, Expand);
     436             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     437             :     setOperationAction(ISD::SETCC, VT, Expand);
     438             :   }
     439             : 
     440             :   static const MVT::SimpleValueType FloatVectorTypes[] = {
     441             :     MVT::v2f32, MVT::v4f32
     442             :   };
     443             : 
     444        8349 :   for (MVT VT : FloatVectorTypes) {
     445             :     setOperationAction(ISD::FABS, VT, Expand);
     446             :     setOperationAction(ISD::FMINNUM, VT, Expand);
     447             :     setOperationAction(ISD::FMAXNUM, VT, Expand);
     448             :     setOperationAction(ISD::FADD, VT, Expand);
     449             :     setOperationAction(ISD::FCEIL, VT, Expand);
     450             :     setOperationAction(ISD::FCOS, VT, Expand);
     451             :     setOperationAction(ISD::FDIV, VT, Expand);
     452             :     setOperationAction(ISD::FEXP2, VT, Expand);
     453             :     setOperationAction(ISD::FEXP, VT, Expand);
     454             :     setOperationAction(ISD::FLOG2, VT, Expand);
     455             :     setOperationAction(ISD::FREM, VT, Expand);
     456             :     setOperationAction(ISD::FLOG, VT, Expand);
     457             :     setOperationAction(ISD::FLOG10, VT, Expand);
     458             :     setOperationAction(ISD::FPOW, VT, Expand);
     459             :     setOperationAction(ISD::FFLOOR, VT, Expand);
     460             :     setOperationAction(ISD::FTRUNC, VT, Expand);
     461             :     setOperationAction(ISD::FMUL, VT, Expand);
     462             :     setOperationAction(ISD::FMA, VT, Expand);
     463             :     setOperationAction(ISD::FRINT, VT, Expand);
     464             :     setOperationAction(ISD::FNEARBYINT, VT, Expand);
     465             :     setOperationAction(ISD::FSQRT, VT, Expand);
     466             :     setOperationAction(ISD::FSIN, VT, Expand);
     467             :     setOperationAction(ISD::FSUB, VT, Expand);
     468             :     setOperationAction(ISD::FNEG, VT, Expand);
     469             :     setOperationAction(ISD::VSELECT, VT, Expand);
     470             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     471             :     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     472             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     473             :     setOperationAction(ISD::SETCC, VT, Expand);
     474             :     setOperationAction(ISD::FCANONICALIZE, VT, Expand);
     475             :   }
     476             : 
     477             :   // This causes using an unrolled select operation rather than expansion with
     478             :   // bit operations. This is in general better, but the alternative using BFI
     479             :   // instructions may be better if the select sources are SGPRs.
     480             :   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
     481             :   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
     482             : 
     483             :   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
     484             :   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
     485             : 
     486             :   // There are no libcalls of any kind.
     487     1310793 :   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     488             :     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
     489             : 
     490             :   setBooleanContents(ZeroOrNegativeOneBooleanContent);
     491             :   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     492             : 
     493             :   setSchedulingPreference(Sched::RegPressure);
     494        2783 :   setJumpIsExpensive(true);
     495             : 
     496             :   // FIXME: This is only partially true. If we have to do vector compares, any
     497             :   // SGPR pair can be a condition register. If we have a uniform condition, we
     498             :   // are better off doing SALU operations, where there is only one SCC. For now,
     499             :   // we don't have a way of knowing during instruction selection if a condition
     500             :   // will be uniform and we always use vector compares. Assume we are using
     501             :   // vector compares until that is fixed.
     502             :   setHasMultipleConditionRegisters(true);
     503             : 
     504        2783 :   PredictableSelectIsExpensive = false;
     505             : 
     506             :   // We want to find all load dependencies for long chains of stores to enable
     507             :   // merging into very wide vectors. The problem is with vectors with > 4
     508             :   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
     509             :   // vectors are a legal type, even though we have to split the loads
     510             :   // usually. When we can more precisely specify load legality per address
     511             :   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
     512             :   // smarter so that they can figure out what to do in 2 iterations without all
     513             :   // N > 4 stores on the same chain.
     514        2783 :   GatherAllAliasesMaxDepth = 16;
     515             : 
     516             :   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
     517             :   // about these during lowering.
     518        2783 :   MaxStoresPerMemcpy  = 0xffffffff;
     519        2783 :   MaxStoresPerMemmove = 0xffffffff;
     520        2783 :   MaxStoresPerMemset  = 0xffffffff;
     521             : 
     522             :   setTargetDAGCombine(ISD::BITCAST);
     523             :   setTargetDAGCombine(ISD::SHL);
     524             :   setTargetDAGCombine(ISD::SRA);
     525             :   setTargetDAGCombine(ISD::SRL);
     526             :   setTargetDAGCombine(ISD::TRUNCATE);
     527             :   setTargetDAGCombine(ISD::MUL);
     528             :   setTargetDAGCombine(ISD::MULHU);
     529             :   setTargetDAGCombine(ISD::MULHS);
     530             :   setTargetDAGCombine(ISD::SELECT);
     531             :   setTargetDAGCombine(ISD::SELECT_CC);
     532             :   setTargetDAGCombine(ISD::STORE);
     533             :   setTargetDAGCombine(ISD::FADD);
     534             :   setTargetDAGCombine(ISD::FSUB);
     535             :   setTargetDAGCombine(ISD::FNEG);
     536             :   setTargetDAGCombine(ISD::FABS);
     537             :   setTargetDAGCombine(ISD::AssertZext);
     538             :   setTargetDAGCombine(ISD::AssertSext);
     539        2783 : }
     540             : 
     541             : //===----------------------------------------------------------------------===//
     542             : // Target Information
     543             : //===----------------------------------------------------------------------===//
     544             : 
     545             : LLVM_READNONE
     546        1014 : static bool fnegFoldsIntoOp(unsigned Opc) {
                         // Classifies an opcode purely by value: every case label below answers
                         // true, and any other opcode answers false.
                         // NOTE(review): going by the function name, these are presumably the
                         // operations an fneg can be folded into -- confirm against the callers.
     547        1014 :   switch (Opc) {
     548             :   case ISD::FADD:
     549             :   case ISD::FSUB:
     550             :   case ISD::FMUL:
     551             :   case ISD::FMA:
     552             :   case ISD::FMAD:
     553             :   case ISD::FMINNUM:
     554             :   case ISD::FMAXNUM:
     555             :   case ISD::FSIN:
     556             :   case ISD::FTRUNC:
     557             :   case ISD::FRINT:
     558             :   case ISD::FNEARBYINT:
     559             :   case ISD::FCANONICALIZE:
     560             :   case AMDGPUISD::RCP:
     561             :   case AMDGPUISD::RCP_LEGACY:
     562             :   case AMDGPUISD::RCP_IFLAG:
     563             :   case AMDGPUISD::SIN_HW:
     564             :   case AMDGPUISD::FMUL_LEGACY:
     565             :   case AMDGPUISD::FMIN_LEGACY:
     566             :   case AMDGPUISD::FMAX_LEGACY:
     567             :   case AMDGPUISD::FMED3:
     568             :     return true;
     569         651 :   default:
     570         651 :     return false;
     571             :   }
     572             : }
     573             : 
     574             : /// \returns true if the operation will definitely need to use a 64-bit
     575             : /// encoding, and thus will use a VOP3 encoding regardless of the source
     576             : /// modifiers.
     577             : LLVM_READONLY
     578             : static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
                         // A node with more than two operands, or an f64 value type, is treated
                         // as forcing the VOP3 encoding.
     579        2791 :   return N->getNumOperands() > 2 || VT == MVT::f64;
     580             : }
     581             : 
     582             : // Most FP instructions support source modifiers, but this could be refined
     583             : // slightly.
     584             : LLVM_READONLY
     585        3694 : static bool hasSourceMods(const SDNode *N) {
                         // Memory nodes are rejected before the opcode is even examined.
     586        3694 :   if (isa<MemSDNode>(N))
     587             :     return false;
     588             : 
                         // The opcodes listed below are reported as lacking source modifiers;
                         // every other opcode is assumed to have them.
     589        6536 :   switch (N->getOpcode()) {
     590             :   case ISD::CopyToReg:
     591             :   case ISD::SELECT:
     592             :   case ISD::FDIV:
     593             :   case ISD::FREM:
     594             :   case ISD::INLINEASM:
     595             :   case AMDGPUISD::INTERP_P1:
     596             :   case AMDGPUISD::INTERP_P2:
     597             :   case AMDGPUISD::DIV_SCALE:
     598             : 
     599             :   // TODO: Should really be looking at the users of the bitcast. These are
     600             :   // problematic because bitcasts are used to legalize all stores to integer
     601             :   // types.
     602             :   case ISD::BITCAST:
     603             :     return false;
     604        2791 :   default:
     605        2791 :     return true;
     606             :   }
     607             : }
     608             : 
     609        3559 : bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
     610             :                                                  unsigned CostThreshold) {
                         // Returns true only if every user of N passes hasSourceMods() and at most
                         // CostThreshold of those users are not already forced into VOP3.
                         //
     611             :   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
     612             :   // it is truly free to use a source modifier in all cases. If there are
     613             :   // multiple users and the modifier would force each of them into VOP3, there
     614             :   // will be a code size increase. Try to avoid increasing code size unless we
     615             :   // know it will save on the instruction count.
     616             :   unsigned NumMayIncreaseSize = 0;
     617        7118 :   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
     618             : 
     619             :   // XXX - Should this limit number of uses to check?
     620        5151 :   for (const SDNode *U : N->uses()) {
     621        3694 :     if (!hasSourceMods(U))
     622             :       return false;
     623             : 
                           // Only users that would not need VOP3 anyway count against the budget.
     624             :     if (!opMustUseVOP3Encoding(U, VT)) {
     625        1431 :       if (++NumMayIncreaseSize > CostThreshold)
     626             :         return false;
     627             :     }
     628             :   }
     629             : 
     630             :   return true;
     631             : }
     632             : 
     633      159088 : MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
                         // Vector indices are always i32; the DataLayout argument is ignored.
     634      159088 :   return MVT::i32;
     635             : }
     636             : 
     637        2742 : bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
                         // Every select kind is reported as supported; SelType is not inspected.
     638        2742 :   return true;
     639             : }
     640             : 
     641             : // The backend supports 32- and 64-bit floating point immediates, plus 16-bit
                       // ones when the subtarget has 16-bit instructions. The decision is made on
                       // the scalar type alone; the value of Imm is never examined.
     642             : // FIXME: Why are we reporting vectors of FP immediates as legal?
     643           0 : bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
     644           0 :   EVT ScalarVT = VT.getScalarType();
     645             :   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
     646           0 :          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
     647             : }
     648             : 
     649             : // We don't want to shrink f64 / f32 constants. Any other scalar type
                       // (vectors are judged by their element type) may be shrunk.
     650           0 : bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
     651           0 :   EVT ScalarVT = VT.getScalarType();
     652           0 :   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
     653             : }
     654             : 
     655        2251 : bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
     656             :                                                  ISD::LoadExtType,
     657             :                                                  EVT NewVT) const {
                         // Decides from store sizes alone whether narrowing the load N to NewVT
                         // is worthwhile; the load-extension kind argument is unnamed and unused.
     658             : 
     659             :   unsigned NewSize = NewVT.getStoreSizeInBits();
     660             : 
     661             :   // If we are reducing to a 32-bit load, this is always better.
     662        2251 :   if (NewSize == 32)
     663             :     return true;
     664             : 
     665        3804 :   EVT OldVT = N->getValueType(0);
     666             :   unsigned OldSize = OldVT.getStoreSizeInBits();
     667             : 
     668             :   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
     669             :   // extloads, so doing one requires using a buffer_load. In cases where we
     670             :   // still couldn't use a scalar load, using the wider load shouldn't really
     671             :   // hurt anything.
     672             : 
     673             :   // If the old size already had to be an extload, there's no harm in continuing
     674             :   // to reduce the width.
     675        1902 :   return (OldSize < 32);
     676             : }
     677             : 
     678       18200 : bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
     679             :                                                    EVT CastTy) const {
     680             : 
                         // Both types must have the same total width.
     681             :   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
     682             : 
                         // A load whose scalar type is already i32 is never bitcast.
     683       20166 :   if (LoadTy.getScalarType() == MVT::i32)
     684       16234 :     return false;
     685             : 
     686             :   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
     687             :   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
     688             : 
                         // Beneficial when the cast widens the scalar element, or when the cast
                         // scalar is at least 32 bits wide.
     689        1966 :   return (LScalarSize < CastScalarSize) ||
     690        1966 :          (CastScalarSize >= 32);
     691             : }
     692             : 
     693             : // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
     694             : // profitable with the expansion for 64-bit since it's generally good to
     695             : // speculate things.
     696             : // FIXME: These should really have the size as a parameter.
     697          27 : bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
                         // Unconditionally true.
     698          27 :   return true;
     699             : }
     700             : 
     701          60 : bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
                         // Unconditionally true, matching isCheapToSpeculateCttz().
     702          60 :   return true;
     703             : }
     704             : 
     705     2586210 : bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
                         // Answers true for EntryToken/TokenFactor nodes, for the readfirstlane and
                         // readlane intrinsics, and for loads from the 32-bit constant address
                         // space; every other node answers false.
     706     5172420 :   switch (N->getOpcode()) {
     707             :     default:
     708             :     return false;
     709      120069 :     case ISD::EntryToken:
     710             :     case ISD::TokenFactor:
     711      120069 :       return true;
     712       22711 :     case ISD::INTRINSIC_WO_CHAIN:
     713             :     {
                             // Operand 0 of the node carries the intrinsic ID as a constant.
     714       45422 :       unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     715       22711 :       switch (IntrID) {
     716             :         default:
     717             :         return false;
     718         157 :         case Intrinsic::amdgcn_readfirstlane:
     719             :         case Intrinsic::amdgcn_readlane:
     720         157 :           return true;
     721             :       }
     722             :     }
                           // Not reached: every path through the inner switch returns.
     723             :     break;
     724             :     case ISD::LOAD:
     725             :     {
                             // NOTE(review): the dyn_cast result is dereferenced without a null
                             // check; cast<LoadSDNode> would state the intent more directly.
     726             :       const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
     727      171181 :       if (L->getMemOperand()->getAddrSpace()
     728             :       == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
     729         162 :         return true;
     730             :       return false;
     731             :     }
                           // Not reached: both branches above return.
     732             :     break;
     733             :   }
     734             : }
     735             : 
     736             : //===---------------------------------------------------------------------===//
     737             : // Target Properties
     738             : //===---------------------------------------------------------------------===//
     739             : 
     740        2086 : bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
     741             :   assert(VT.isFloatingPoint());
     742             : 
                         // True for scalar f32 and f64, and for f16 only when the subtarget has
                         // 16-bit instructions.
     743             :   // Packed operations do not have a fabs modifier.
     744         477 :   return VT == MVT::f32 || VT == MVT::f64 ||
     745         477 :          (Subtarget->has16BitInsts() && VT == MVT::f16);
     746             : }
     747             : 
     748        4490 : bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
     749             :   assert(VT.isFloatingPoint());
                         // True for f32 and f64 always, for f16 when the subtarget has 16-bit
                         // instructions, and for v2f16 when it has VOP3P instructions.
     750         651 :   return VT == MVT::f32 || VT == MVT::f64 ||
     751         651 :          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
     752         292 :          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
     753             : }
     754             : 
     755        5532 : bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
     756             :                                                          unsigned NumElem,
     757             :                                                          unsigned AS) const {
                         // Unconditionally true; none of the three parameters is consulted.
     758        5532 :   return true;
     759             : }
     760             : 
     761       33489 : bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
     762             :   // There are few operations which truly have vector input operands. Any vector
     763             :   // operation is going to involve operations on each component, and a
     764             :   // build_vector will be a copy per element, so it always makes sense to use a
     765             :   // build_vector input in place of the extracted element to avoid a copy into a
     766             :   // super register.
     767             :   //
     768             :   // We should probably only do this if all users are extracts only, but this
     769             :   // should be the common case.
                         //
                         // The answer is the same for every vector type; VecVT is not consulted.
     770       33489 :   return true;
     771             : }
     772             : 
     773       21675 : bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
     774             :   // Truncate is just accessing a subregister.
     775             : 
     776       21675 :   unsigned SrcSize = Source.getSizeInBits();
     777       21675 :   unsigned DestSize = Dest.getSizeInBits();
     778             : 
                         // Free only when the result is strictly narrower than the source and its
                         // width is a multiple of 32 bits.
     779       21675 :   return DestSize < SrcSize && DestSize % 32 == 0 ;
     780             : }
     781             : 
     782        1215 : bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
     783             :   // Truncate is just accessing a subregister.
     784             : 
                         // This overload compares scalar (element) sizes, not total sizes.
     785        1215 :   unsigned SrcSize = Source->getScalarSizeInBits();
     786        1215 :   unsigned DestSize = Dest->getScalarSizeInBits();
     787             : 
                         // With 16-bit instructions available, truncating to 16 bits is free from
                         // any source of at least 32 bits.
     788        1215 :   if (DestSize== 16 && Subtarget->has16BitInsts())
     789          30 :     return SrcSize >= 32;
     790             : 
                         // Otherwise: strictly narrower, and a multiple of 32 bits.
     791        1185 :   return DestSize < SrcSize && DestSize % 32 == 0;
     792             : }
     793             : 
     794          91 : bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
                         // Compares scalar (element) sizes of the two IR types.
     795          91 :   unsigned SrcSize = Src->getScalarSizeInBits();
     796          91 :   unsigned DestSize = Dest->getScalarSizeInBits();
     797             : 
                         // With 16-bit instructions available, extending from 16 bits is free to
                         // any destination of at least 32 bits.
     798          91 :   if (SrcSize == 16 && Subtarget->has16BitInsts())
     799           6 :     return DestSize >= 32;
     800             : 
                         // Otherwise only the 32-bit to 64-bit extension is free.
     801          85 :   return SrcSize == 32 && DestSize == 64;
     802             : }
     803             : 
     804        8038 : bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
     805             :   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
     806             :   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
     807             :   // this will enable reducing 64-bit operations to 32-bit, which is always
     808             :   // good.
     809             : 
                         // NOTE(review): unlike the Type* overload, the i16 case here does not
                         // check Subtarget->has16BitInsts() -- confirm this is intentional.
     810             :   if (Src == MVT::i16)
     811             :     return Dest == MVT::i32 ||Dest == MVT::i64 ;
     812             : 
     813             :   return Src == MVT::i32 && Dest == MVT::i64;
     814             : }
     815             : 
     816        6844 : bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
                         // Decided from the value's type alone, via the (EVT, EVT) overload.
     817       13688 :   return isZExtFree(Val.getValueType(), VT2);
     818             : }
     819             : 
     820        8050 : bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
     821             :   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
     822             :   // limited number of native 64-bit operations. Shrinking an operation to fit
     823             :   // in a single 32-bit register should always be helpful. As currently used,
     824             :   // this is much less general than the name suggests, and is only used in
     825             :   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
     826             :   // not profitable, and may actually be harmful.
                         //
                         // Hence: profitable only when going from more than 32 bits to exactly 32.
     827        8050 :   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
     828             : }
     829             : 
     830             : //===---------------------------------------------------------------------===//
     831             : // TargetLowering Callbacks
     832             : //===---------------------------------------------------------------------===//
     833             : 
     834        4214 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
     835             :                                                   bool IsVarArg) {
                         // Maps a calling convention to its argument-assignment function:
                         // the AMDGPU_* shader conventions get CC_AMDGPU, and C/Fast/Cold get
                         // CC_AMDGPU_Func. Kernel conventions are asserted unreachable, and any
                         // other convention is a fatal error. IsVarArg is not consulted.
     836        4214 :   switch (CC) {
     837             :   case CallingConv::AMDGPU_KERNEL:
     838             :   case CallingConv::SPIR_KERNEL:
     839             :     llvm_unreachable("kernels should not be handled here");
     840             :   case CallingConv::AMDGPU_VS:
     841             :   case CallingConv::AMDGPU_GS:
     842             :   case CallingConv::AMDGPU_PS:
     843             :   case CallingConv::AMDGPU_CS:
     844             :   case CallingConv::AMDGPU_HS:
     845             :   case CallingConv::AMDGPU_ES:
     846             :   case CallingConv::AMDGPU_LS:
     847             :     return CC_AMDGPU;
     848        2465 :   case CallingConv::C:
     849             :   case CallingConv::Fast:
     850             :   case CallingConv::Cold:
     851        2465 :     return CC_AMDGPU_Func;
     852           0 :   default:
     853           0 :     report_fatal_error("Unsupported calling convention.");
     854             :   }
     855             : }
     856             : 
     857        6325 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
     858             :                                                     bool IsVarArg) {
                         // Return-value counterpart of CCAssignFnForCall: the AMDGPU_* shader
                         // conventions get RetCC_SI_Shader, and C/Fast/Cold get RetCC_AMDGPU_Func.
                         // Kernel conventions are asserted unreachable, and any other convention
                         // is a fatal error. IsVarArg is not consulted.
     859        6325 :   switch (CC) {
     860             :   case CallingConv::AMDGPU_KERNEL:
     861             :   case CallingConv::SPIR_KERNEL:
     862             :     llvm_unreachable("kernels should not be handled here");
     863             :   case CallingConv::AMDGPU_VS:
     864             :   case CallingConv::AMDGPU_GS:
     865             :   case CallingConv::AMDGPU_PS:
     866             :   case CallingConv::AMDGPU_CS:
     867             :   case CallingConv::AMDGPU_HS:
     868             :   case CallingConv::AMDGPU_ES:
     869             :   case CallingConv::AMDGPU_LS:
     870             :     return RetCC_SI_Shader;
     871        4588 :   case CallingConv::C:
     872             :   case CallingConv::Fast:
     873             :   case CallingConv::Cold:
     874        4588 :     return RetCC_AMDGPU_Func;
     875           0 :   default:
     876           0 :     report_fatal_error("Unsupported calling convention.");
     877             :   }
     878             : }
     879             : 
     880             : /// The SelectionDAGBuilder will automatically promote function arguments
     881             : /// with illegal types.  However, this does not work for the AMDGPU targets
     882             : /// since the function arguments are stored in memory as these illegal types.
     883             : /// In order to handle this properly we need to get the original type sizes
     884             : /// from the LLVM IR Function and fixup the ISD::InputArg values before
     885             : /// passing them to AnalyzeFormalArguments()
     886             : 
     887             : /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
     888             : /// input values across multiple registers.  Each item in the Ins array
     889             : /// represents a single value that will be stored in registers.  Ins[x].VT is
     890             : /// the value type of the value that will be stored in the register, so
     891             : /// whatever SDNode we lower the argument to needs to be this type.
     892             : ///
     893             : /// In order to correctly lower the arguments we need to know the size of each
     894             : /// argument.  Since Ins[x].VT gives us the size of the register that will
     895             : /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
     896             : /// for the original function argument so that we can deduce the correct memory
     897             : /// type to use for Ins[x].  In most cases the correct memory type will be
     898             : /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
     899             : /// we have a kernel argument of type v8i8, this argument will be split into
     900             : /// 8 parts and each part will be represented by its own item in the Ins array.
     901             : /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
     902             : /// the argument before it was split.  From this, we deduce that the memory type
     903             : /// for each individual part is i8.  We pass the memory type as LocVT to the
     904             : /// calling convention analysis function and the register type (Ins[x].VT) as
     905             : /// the ValVT.
     906       18461 : void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
     907             :   CCState &State,
     908             :   const SmallVectorImpl<ISD::InputArg> &Ins) const {
                         // NOTE(review): the body below never reads Ins; everything is recomputed
                         // from the IR function's own argument list.
     909       18461 :   const MachineFunction &MF = State.getMachineFunction();
     910       18461 :   const Function &Fn = MF.getFunction();
     911       18461 :   LLVMContext &Ctx = Fn.getParent()->getContext();
     912       18461 :   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
     913             :   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
     914             :   CallingConv::ID CC = Fn.getCallingConv();
     915             : 
                         // NOTE(review): MaxAlign is updated in the loop but not read again
                         // within this function.
     916       18461 :   unsigned MaxAlign = 1;
     917             :   uint64_t ExplicitArgOffset = 0;
     918       18461 :   const DataLayout &DL = Fn.getParent()->getDataLayout();
     919             : 
                         // Running index of the location being added; incremented once per
                         // register part in the innermost loop.
     920             :   unsigned InIndex = 0;
     921             : 
     922       59013 :   for (const Argument &Arg : Fn.args()) {
     923       40552 :     Type *BaseArgTy = Arg.getType();
     924       40552 :     unsigned Align = DL.getABITypeAlignment(BaseArgTy);
     925       40552 :     MaxAlign = std::max(Align, MaxAlign);
     926       40552 :     unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
     927             : 
                           // ArgOffset is where this argument starts (aligned running offset plus
                           // the subtarget's explicit offset); the running offset then advances
                           // past the argument's allocation size.
     928       40552 :     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
     929       40552 :     ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
     930             : 
     931             :     // We're basically throwing away everything passed into us and starting over
     932             :     // to get accurate in-memory offsets. The "PartOffset" is completely useless
     933             :     // to us as computed in Ins.
     934             :     //
     935             :     // We also need to figure out what type legalization is trying to do to get
     936             :     // the correct memory offsets.
     937             : 
     938             :     SmallVector<EVT, 16> ValueVTs;
     939             :     SmallVector<uint64_t, 16> Offsets;
     940       40552 :     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
     941             : 
     942       85747 :     for (unsigned Value = 0, NumValues = ValueVTs.size();
     943       85747 :          Value != NumValues; ++Value) {
     944       90390 :       uint64_t BasePartOffset = Offsets[Value];
     945             : 
     946       45195 :       EVT ArgVT = ValueVTs[Value];
     947       45195 :       EVT MemVT = ArgVT;
     948       45195 :       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
     949       45195 :       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
     950             : 
                             // Choose MemVT, the in-memory type of each register part.
     951       45195 :       if (NumRegs == 1) {
     952             :         // This argument is not split, so the IR type is the memory type.
     953       44391 :         if (ArgVT.isExtended()) {
     954             :           // We have an extended type, like i24, so we should just use the
     955             :           // register type.
     956         112 :           MemVT = RegisterVT;
     957             :         } else {
     958       44279 :           MemVT = ArgVT;
     959             :         }
     960         804 :       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
     961         804 :                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
     962             :         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
     963             :         // We have a vector value which has been split into a vector with
     964             :         // the same scalar type, but fewer elements.  This should handle
     965             :         // all the floating-point vector types.
     966         216 :         MemVT = RegisterVT;
     967         860 :       } else if (ArgVT.isVector() &&
     968             :                  ArgVT.getVectorNumElements() == NumRegs) {
     969             :         // This arg has been split so that each element is stored in a separate
     970             :         // register.
     971         263 :         MemVT = ArgVT.getScalarType();
     972         325 :       } else if (ArgVT.isExtended()) {
     973             :         // We have an extended type, like i65.
     974          25 :         MemVT = RegisterVT;
     975             :       } else {
                               // Divide the argument's store size evenly across the registers.
     976         300 :         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
     977             :         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
     978         600 :         if (RegisterVT.isInteger()) {
     979         300 :           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
     980           0 :         } else if (RegisterVT.isVector()) {
     981             :           assert(!RegisterVT.getScalarType().isFloatingPoint());
     982             :           unsigned NumElements = RegisterVT.getVectorNumElements();
     983             :           assert(MemoryBits % NumElements == 0);
     984             :           // This vector type has been split into another vector type with
     985             :           // a different elements size.
     986             :           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
     987           0 :                                            MemoryBits / NumElements);
     988           0 :           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
     989             :         } else {
     990           0 :           llvm_unreachable("cannot deduce memory type.");
     991             :         }
     992             :       }
     993             : 
     994             :       // Convert one element vectors to scalar.
     995       46784 :       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
     996          50 :         MemVT = MemVT.getScalarType();
     997             : 
     998       45195 :       if (MemVT.isExtended()) {
     999             :         // This should really only happen if we have vec3 arguments
    1000             :         assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
    1001           0 :         MemVT = MemVT.getPow2VectorType(State.getContext());
    1002             :       }
    1003             : 
                             // Record one custom-memory location per register part; consecutive
                             // parts are laid out MemVT.getStoreSize() bytes apart.
    1004             :       unsigned PartOffset = 0;
    1005       92426 :       for (unsigned i = 0; i != NumRegs; ++i) {
    1006       47231 :         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
    1007             :                                                BasePartOffset + PartOffset,
    1008             :                                                MemVT.getSimpleVT(),
    1009       47231 :                                                CCValAssign::Full));
    1010       47231 :         PartOffset += MemVT.getStoreSize();
    1011             :       }
    1012             :     }
    1013             :   }
    1014       18461 : }
    1015             : 
    1016       18482 : SDValue AMDGPUTargetLowering::LowerReturn(
    1017             :   SDValue Chain, CallingConv::ID CallConv,
    1018             :   bool isVarArg,
    1019             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1020             :   const SmallVectorImpl<SDValue> &OutVals,
    1021             :   const SDLoc &DL, SelectionDAG &DAG) const {
                         // Every return is lowered to a bare ENDPGM node chained on Chain;
                         // CallConv, isVarArg, Outs and OutVals are not consulted.
    1022             :   // FIXME: Fails for r600 tests
    1023             :   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
    1024             :   // "wave terminate should not have return values");
    1025       18482 :   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
    1026             : }
    1027             : 
    1028             : //===---------------------------------------------------------------------===//
    1029             : // Target specific lowering
    1030             : //===---------------------------------------------------------------------===//
    1031             : 
    1032             : /// Selects the correct CCAssignFn for a given CallingConvention value.
                       /// Forwards both arguments unchanged to
                       /// AMDGPUCallLowering::CCAssignFnForCall.
    1033        4202 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
    1034             :                                                     bool IsVarArg) {
    1035        4202 :   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
    1036             : }
    1037             : 
    1038        6325 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
    1039             :                                                       bool IsVarArg) {
                         // Forwards both arguments unchanged to
                         // AMDGPUCallLowering::CCAssignFnForReturn.
    1040        6325 :   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
    1041             : }
    1042             : 
                       /// Builds a TokenFactor that orders \p Chain after every load that may read
                       /// bytes of the frame object \p ClobberedFI.
                       ///
                       /// The loads considered are the users of the DAG entry node that are
                       /// LoadSDNodes whose base pointer is a FrameIndexSDNode with a negative index
                       /// (presumably fixed objects such as incoming stack arguments -- confirm
                       /// against MachineFrameInfo). A load is included when its object's byte range
                       /// [offset, offset + size - 1] overlaps the byte range of \p ClobberedFI.
                       ///
                       /// \returns an ISD::TokenFactor over \p Chain followed by the chain results
                       /// (value #1) of all matching loads.
    1043          35 : SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
    1044             :                                                   SelectionDAG &DAG,
    1045             :                                                   MachineFrameInfo &MFI,
    1046             :                                                   int ClobberedFI) const {
    1047             :   SmallVector<SDValue, 8> ArgChains;
                         // Inclusive byte range occupied by the object that is about to be clobbered.
    1048             :   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
    1049          35 :   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
    1050             : 
    1051             :   // Include the original chain at the beginning of the list. When this is
    1052             :   // used by target LowerCall hooks, this helps legalize find the
    1053             :   // CALLSEQ_BEGIN node.
    1054          35 :   ArgChains.push_back(Chain);
    1055             : 
    1056             :   // Add a chain value for each stack argument load that overlaps the clobbered object.
    1057          35 :   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
    1058             :                             UE = DAG.getEntryNode().getNode()->use_end();
    1059        1190 :        U != UE; ++U) {
    1060             :     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
    1061             :       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
                             // Only negative frame indices are considered here.
    1062          80 :         if (FI->getIndex() < 0) {
    1063             :           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
    1064             :           int64_t InLastByte = InFirstByte;
    1065          80 :           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
    1066             : 
                               // Interval-overlap test: either range contains the other's first byte.
    1067          80 :           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
    1068          52 :               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
    1069          28 :             ArgChains.push_back(SDValue(L, 1));
    1070             :         }
    1071             :       }
    1072             :     }
    1073             :   }
    1074             : 
    1075             :   // Build a tokenfactor for all the chains.
    1076          70 :   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
    1077             : }
    1078             : 
                       /// Reports a call that cannot be lowered and produces placeholder results.
                       ///
                       /// Emits a DiagnosticInfoUnsupported whose message is \p Reason followed by
                       /// the callee name. The name is taken from the callee node when it is an
                       /// ExternalSymbolSDNode or a GlobalAddressSDNode, and is "<unknown>" otherwise.
                       ///
                       /// \param CLI     call description; its Callee, DAG, DL, IsTailCall and Ins
                       ///                fields are read.
                       /// \param InVals  for non-tail calls, receives one UNDEF value per entry of
                       ///                CLI.Ins (using that entry's VT).
                       /// \param Reason  message prefix for the diagnostic.
                       /// \returns the DAG entry node.
    1079          85 : SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
    1080             :                                                  SmallVectorImpl<SDValue> &InVals,
    1081             :                                                  StringRef Reason) const {
    1082          85 :   SDValue Callee = CLI.Callee;
    1083          85 :   SelectionDAG &DAG = CLI.DAG;
    1084             : 
    1085          85 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1086             : 
                         // Fallback name used when the callee is neither a symbol nor a global.
    1087             :   StringRef FuncName("<unknown>");
    1088             : 
    1089             :   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    1090          64 :     FuncName = G->getSymbol();
    1091             :   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    1092          19 :     FuncName = G->getGlobal()->getName();
    1093             : 
    1094             :   DiagnosticInfoUnsupported NoCalls(
    1095          85 :     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
    1096          85 :   DAG.getContext()->diagnose(NoCalls);
    1097             : 
                         // Tail calls get no placeholder results; other calls get one UNDEF per
                         // expected incoming value.
    1098          84 :   if (!CLI.IsTailCall) {
    1099         155 :     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
    1100         150 :       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
    1101             :   }
    1102             : 
    1103          84 :   return DAG.getEntryNode();
    1104             : }
    1105             : 
                       /// Call lowering for this base class: every call is treated as unsupported.
                       ///
                       /// Forwards to lowerUnhandledCall with the reason prefix
                       /// "unsupported call to function " and returns its result.
    1106          78 : SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
    1107             :                                         SmallVectorImpl<SDValue> &InVals) const {
    1108          78 :   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
    1109             : }
    1110             : 
                       /// Diagnoses a dynamic alloca as unsupported and returns placeholder values.
                       ///
                       /// Emits a DiagnosticInfoUnsupported ("unsupported dynamic alloca") at the
                       /// debug location of \p Op.
                       ///
                       /// \returns a merge of two values: a zero constant of \p Op's value type and
                       /// \p Op's operand 0 (presumably the incoming chain -- confirm against the
                       /// ISD::DYNAMIC_STACKALLOC operand order).
    1111           3 : SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    1112             :                                                       SelectionDAG &DAG) const {
    1113           3 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1114             : 
    1115             :   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
    1116           3 :                                             SDLoc(Op).getDebugLoc());
    1117           3 :   DAG.getContext()->diagnose(NoDynamicAlloca);
                         // The replacement nodes are created with a default-constructed SDLoc.
    1118           9 :   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
    1119           6 :   return DAG.getMergeValues(Ops, SDLoc());
    1120             : }
    1121             : 
                       /// Dispatches custom lowering for \p Op based on its opcode.
                       ///
                       /// Each handled opcode forwards to the matching Lower* helper and returns its
                       /// result. An opcode without a case prints the node to errs() and then hits
                       /// llvm_unreachable.
                       ///
                       /// \returns the lowered value produced by the selected helper.
    1122       24615 : SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    1123             :                                              SelectionDAG &DAG) const {
    1124       24615 :   switch (Op.getOpcode()) {
    1125           0 :   default:
    1126           0 :     Op->print(errs(), &DAG);
                           // The two literals are concatenated, so the first one must end with a
                           // space to keep "this instruction" as separate words.
    1127           0 :     llvm_unreachable("Custom lowering code for this "
    1128             :                      "instruction is not implemented yet!");
    1129             :     break;
    1130          16 :   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
    1131        5046 :   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    1132       17301 :   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
    1133         171 :   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
    1134          72 :   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
    1135          36 :   case ISD::FREM: return LowerFREM(Op, DAG);
    1136          31 :   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
    1137          75 :   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
    1138          14 :   case ISD::FRINT: return LowerFRINT(Op, DAG);
    1139          48 :   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
    1140          93 :   case ISD::FROUND: return LowerFROUND(Op, DAG);
    1141           0 :   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
                         // 1 / log2(e) == ln(2); presumably the factor LowerFLOG applies to a log2
                         // result -- confirm in LowerFLOG.
    1142          37 :   case ISD::FLOG:
    1143          37 :     return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
                         // ln(2) / ln(10) == log10(2); same assumption as for FLOG above.
    1144          37 :   case ISD::FLOG10:
    1145          37 :     return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
    1146          50 :   case ISD::FEXP:
    1147          50 :     return lowerFEXP(Op, DAG);
    1148          38 :   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
    1149          47 :   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
    1150         983 :   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
    1151          65 :   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
    1152          37 :   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
                         // All four bit-count opcodes share one lowering routine.
    1153         415 :   case ISD::CTTZ:
    1154             :   case ISD::CTTZ_ZERO_UNDEF:
    1155             :   case ISD::CTLZ:
    1156             :   case ISD::CTLZ_ZERO_UNDEF:
    1157         415 :     return LowerCTLZ_CTTZ(Op, DAG);
    1158           3 :   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    1159             :   }
                         // Every case above returns (or is unreachable), so this is only a fallback.
    1160             :   return Op;
    1161             : }
    1162             : 
                       /// Result-replacement hook for nodes with illegal result types.
                       ///
                       /// This implementation never appends anything to \p Results: every opcode,
                       /// including ISD::SIGN_EXTEND_INREG, returns without taking any action.
    1163          43 : void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
    1164             :                                               SmallVectorImpl<SDValue> &Results,
    1165             :                                               SelectionDAG &DAG) const {
    1166             :   switch (N->getOpcode()) {
    1167             :   case ISD::SIGN_EXTEND_INREG:
    1168             :     // Different parts of legalization seem to disagree about which type of
    1169             :     // sign_extend_inreg is the one to check for custom lowering. The extended
    1170             :     // from type is what really matters, but some places check for custom
    1171             :     // lowering of the result type. This results in trying to use
    1172             :     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    1173             :     // nothing here and let the illegal result integer be handled normally.
    1174             :     return;
    1175             :   default:
    1176             :     return;
    1177             :   }
    1178             : }
    1179             : 
                       /// Returns true when \p GV is a GlobalVariable that has an initializer which
                       /// is not an UndefValue.
                       ///
                       /// Returns false when \p GV is not a GlobalVariable, has no initializer, or
                       /// its initializer is undef.
    1180             : static bool hasDefinedInitializer(const GlobalValue *GV) {
    1181             :   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
    1182         455 :   if (!GVar || !GVar->hasInitializer())
    1183             :     return false;
    1184             : 
    1185         437 :   return !isa<UndefValue>(GVar->getInitializer());
    1186             : }
    1187             : 
                       /// Lowers a global address node.
                       ///
                       /// For globals in AMDGPUAS::LOCAL_ADDRESS or AMDGPUAS::REGION_ADDRESS:
                       ///  - a diagnostic is emitted when \p MFI is not an entry function (lowering
                       ///    then continues; there is no early return);
                       ///  - the node's offset is asserted to be zero;
                       ///  - when the global has no defined (non-undef) initializer, it is allocated
                       ///    through MFI->allocateLDSGlobal and the returned offset is produced as a
                       ///    constant of \p Op's value type.
                       ///
                       /// Every other case (a different address space, or a defined initializer)
                       /// emits "unsupported initializer for address space" and returns an empty
                       /// SDValue.
    1188         455 : SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
    1189             :                                                  SDValue Op,
    1190             :                                                  SelectionDAG &DAG) const {
    1191             : 
    1192         455 :   const DataLayout &DL = DAG.getDataLayout();
    1193             :   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    1194         455 :   const GlobalValue *GV = G->getGlobal();
    1195             : 
    1196         455 :   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
    1197           0 :       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    1198         455 :     if (!MFI->isEntryFunction()) {
    1199           1 :       const Function &Fn = DAG.getMachineFunction().getFunction();
    1200             :       DiagnosticInfoUnsupported BadLDSDecl(
    1201           1 :         Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
    1202           1 :       DAG.getContext()->diagnose(BadLDSDecl);
    1203             :     }
    1204             : 
    1205             :     // XXX: What does the value of G->getOffset() mean?
    1206             :     assert(G->getOffset() == 0 &&
    1207             :          "Do not know what to do with an non-zero offset");
    1208             : 
    1209             :     // TODO: We could emit code to handle the initialization somewhere.
    1210         437 :     if (!hasDefinedInitializer(GV)) {
    1211         447 :       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
    1212         894 :       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    1213             :     }
    1214             :   }
    1215             : 
                         // Reached for other address spaces and for globals with a defined
                         // initializer.
    1216           8 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1217             :   DiagnosticInfoUnsupported BadInit(
    1218           8 :       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
    1219           8 :   DAG.getContext()->diagnose(BadInit);
    1220           8 :   return SDValue();
    1221             : }
    1222             : 
                       /// Lowers CONCAT_VECTORS to a BUILD_VECTOR.
                       ///
                       /// When the result type is v4i16 or v4f16, operands 0 and 1 are each bitcast
                       /// to i32, combined into a v2i32 build vector, and bitcast back to the result
                       /// type. For every other type, the elements of all operands are extracted in
                       /// operand order and rebuilt as a single vector of the result type.
    1223        5046 : SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
    1224             :                                                   SelectionDAG &DAG) const {
    1225             :   SmallVector<SDValue, 8> Args;
    1226             : 
    1227        5046 :   EVT VT = Op.getValueType();
    1228             :   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    1229             :     SDLoc SL(Op);
                           // Only operands 0 and 1 are read on this path.
    1230         142 :     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    1231         142 :     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
    1232             : 
    1233         284 :     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    1234         142 :     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    1235             :   }
    1236             : 
                         // Generic path: append the scalar elements of each operand in order.
    1237       14740 :   for (const SDUse &U : Op->ops())
    1238        9836 :     DAG.ExtractVectorElements(U.get(), Args);
    1239             : 
    1240        9808 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1241             : }
    1242             : 
                       /// Lowers EXTRACT_SUBVECTOR to a BUILD_VECTOR of the selected elements.
                       ///
                       /// Operand 1 must be a ConstantSDNode giving the start index. Starting there,
                       /// as many elements as the result type holds are extracted from operand 0 and
                       /// rebuilt as a vector of the result type.
    1243       17301 : SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
    1244             :                                                      SelectionDAG &DAG) const {
    1245             : 
    1246             :   SmallVector<SDValue, 8> Args;
    1247       34602 :   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    1248       17301 :   EVT VT = Op.getValueType();
    1249       17301 :   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
    1250             :                             VT.getVectorNumElements());
    1251             : 
    1252       34602 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1253             : }
    1254             : 
    1255             : /// Generate Min/Max node
                       ///
                       /// Tries to turn a compare-and-select pattern into AMDGPUISD::FMIN_LEGACY or
                       /// AMDGPUISD::FMAX_LEGACY. \p LHS and \p RHS are the compared values, \p CC
                       /// the condition code, and \p True / \p False the values chosen by the
                       /// comparison result (presumably the operands of a select -- confirm at the
                       /// call site).
                       ///
                       /// \returns the new node, or an empty SDValue when
                       ///  - {True, False} is not {LHS, RHS} in either order,
                       ///  - the condition code is an equality, constant or ordered/unordered test,
                       ///  - the condition code is one of SETOLE/SETOLT/SETLE/SETLT or
                       ///    SETGT/SETGE/SETOGE/SETOGT and the combine runs before
                       ///    AfterLegalizeDAG without being called by the legalizer.
    1256         781 : SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
    1257             :                                                    SDValue LHS, SDValue RHS,
    1258             :                                                    SDValue True, SDValue False,
    1259             :                                                    SDValue CC,
    1260             :                                                    DAGCombinerInfo &DCI) const {
                         // The selected values must be exactly the compared values, in either order.
    1261             :   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    1262         647 :     return SDValue();
    1263             : 
    1264         134 :   SelectionDAG &DAG = DCI.DAG;
    1265         134 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    1266         134 :   switch (CCOpcode) {
                         // None of these condition codes produce a min/max node.
    1267             :   case ISD::SETOEQ:
    1268             :   case ISD::SETONE:
    1269             :   case ISD::SETUNE:
    1270             :   case ISD::SETNE:
    1271             :   case ISD::SETUEQ:
    1272             :   case ISD::SETEQ:
    1273             :   case ISD::SETFALSE:
    1274             :   case ISD::SETFALSE2:
    1275             :   case ISD::SETTRUE:
    1276             :   case ISD::SETTRUE2:
    1277             :   case ISD::SETUO:
    1278             :   case ISD::SETO:
    1279             :     break;
    1280          33 :   case ISD::SETULE:
    1281             :   case ISD::SETULT: {
                           // Unordered less-than: operands are swapped relative to the ordered case
                           // below.
    1282             :     if (LHS == True)
    1283          33 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1284           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1285             :   }
    1286           4 :   case ISD::SETOLE:
    1287             :   case ISD::SETOLT:
    1288             :   case ISD::SETLE:
    1289             :   case ISD::SETLT: {
    1290             :     // Ordered. Assume ordered for undefined.
    1291             : 
    1292             :     // Only do this after legalization to avoid interfering with other combines
    1293             :     // which might occur.
    1294           4 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1295           2 :         !DCI.isCalledByLegalizer())
    1296           2 :       return SDValue();
    1297             : 
    1298             :     // We need to permute the operands to get the correct NaN behavior. The
    1299             :     // selected operand is the second one based on the failing compare with NaN,
    1300             :     // so permute it based on the compare type the hardware uses.
    1301             :     if (LHS == True)
    1302           2 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1303           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1304             :   }
    1305          30 :   case ISD::SETUGE:
    1306             :   case ISD::SETUGT: {
                           // Unordered greater-than: mirror image of the SETULE/SETULT case.
    1307             :     if (LHS == True)
    1308          29 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1309           1 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1310             :   }
    1311          41 :   case ISD::SETGT:
    1312             :   case ISD::SETGE:
    1313             :   case ISD::SETOGE:
    1314             :   case ISD::SETOGT: {
                           // Same legalization-phase restriction as the ordered less-than case.
    1315          41 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1316          35 :         !DCI.isCalledByLegalizer())
    1317          14 :       return SDValue();
    1318             : 
    1319             :     if (LHS == True)
    1320          17 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1321          10 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1322             :   }
    1323             :   case ISD::SETCC_INVALID:
    1324             :     llvm_unreachable("Invalid setcc condcode!");
    1325             :   }
                         // Reached only via the `break` of the first case group.
    1326          26 :   return SDValue();
    1327             : }
    1328             : 
                       /// Splits \p Op into two i32 values.
                       ///
                       /// \p Op is bitcast to v2i32 and elements 0 and 1 are extracted.
                       ///
                       /// \returns the pair (element 0, element 1), named Lo and Hi respectively.
    1329             : std::pair<SDValue, SDValue>
    1330        2907 : AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
    1331             :   SDLoc SL(Op);
    1332             : 
    1333        2907 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1334             : 
    1335        2907 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1336        2907 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1337             : 
    1338        2907 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1339        2907 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1340             : 
    1341        2907 :   return std::make_pair(Lo, Hi);
    1342             : }
    1343             : 
                       /// Returns element 0 of \p Op after bitcasting it to v2i32, as an i32 value.
                       /// This is the same value split64BitValue returns as its first (Lo) result.
    1344           0 : SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
    1345             :   SDLoc SL(Op);
    1346             : 
    1347           0 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1348           0 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1349           0 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1350             : }
    1351             : 
                       /// Returns element 1 of \p Op after bitcasting it to v2i32, as an i32 value.
                       /// This is the same value split64BitValue returns as its second (Hi) result.
    1352         138 : SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
    1353             :   SDLoc SL(Op);
    1354             : 
    1355         138 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1356         138 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1357         138 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1358             : }
    1359             : 
                       /// Splits a vector load into two loads of the low and high parts.
                       ///
                       /// \p Op must be a LoadSDNode. A 2-element vector is handed to
                       /// scalarizeVectorLoad instead. Otherwise the value type and memory type are
                       /// split with GetSplitDestVTs, the low part is loaded from the original base
                       /// pointer, and the high part from the base pointer advanced by the store size
                       /// of the low memory type. Both loads keep the original extension type, chain
                       /// and memory-operand flags.
                       ///
                       /// \returns a merge of two values: the CONCAT_VECTORS of the two loads, and a
                       /// TokenFactor of their chain results.
    1360        2512 : SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
    1361             :                                               SelectionDAG &DAG) const {
    1362             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    1363        5024 :   EVT VT = Op.getValueType();
    1364             : 
    1365             : 
    1366             :   // If this is a 2 element vector, we really want to scalarize and not create
    1367             :   // weird 1 element vectors.
    1368        2512 :   if (VT.getVectorNumElements() == 2)
    1369          19 :     return scalarizeVectorLoad(Load, DAG);
    1370             : 
    1371        2493 :   SDValue BasePtr = Load->getBasePtr();
    1372        2493 :   EVT MemVT = Load->getMemoryVT();
    1373             :   SDLoc SL(Op);
    1374             : 
    1375        2493 :   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
    1376             : 
    1377        2493 :   EVT LoVT, HiVT;
    1378        2493 :   EVT LoMemVT, HiMemVT;
    1379             :   SDValue Lo, Hi;
    1380             : 
    1381        2493 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1382        2493 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
                         // NOTE(review): Lo and Hi are not referenced again in this function.
    1383        2493 :   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
    1384             : 
                         // Size is the byte distance from the low part to the high part; the high
                         // part's alignment is derived from the base alignment and that offset.
    1385             :   unsigned Size = LoMemVT.getStoreSize();
    1386        2493 :   unsigned BaseAlign = Load->getAlignment();
    1387        4986 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1388             : 
    1389             :   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
    1390             :                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
    1391        4986 :                                   BaseAlign, Load->getMemOperand()->getFlags());
    1392        2493 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
    1393             :   SDValue HiLoad =
    1394             :       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
    1395             :                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
    1396        4986 :                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
    1397             : 
                         // Result 0: the reassembled vector. Result 1: the combined chain.
    1398             :   SDValue Ops[] = {
    1399        2493 :     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    1400             :     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    1401        4986 :                 LoLoad.getValue(1), HiLoad.getValue(1))
    1402             :   };
    1403             : 
    1404        2493 :   return DAG.getMergeValues(Ops, SL);
    1405             : }
    1406             : 
                       /// Splits a vector store into two stores of the low and high parts.
                       ///
                       /// \p Op must be a StoreSDNode. A 2-element vector value is handed to
                       /// scalarizeVectorStore instead. Otherwise the stored value is split with
                       /// SplitVector, the low part is stored at the original base pointer, and the
                       /// high part at the base pointer advanced by the store size of the low memory
                       /// type. Both stores are truncating stores on the original chain and keep the
                       /// original memory-operand flags.
                       ///
                       /// \returns a TokenFactor of the two stores.
    1407        8472 : SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
    1408             :                                                SelectionDAG &DAG) const {
    1409             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    1410        8472 :   SDValue Val = Store->getValue();
    1411       16944 :   EVT VT = Val.getValueType();
    1412             : 
    1413             :   // If this is a 2 element vector, we really want to scalarize and not create
    1414             :   // weird 1 element vectors.
    1415        8472 :   if (VT.getVectorNumElements() == 2)
    1416          33 :     return scalarizeVectorStore(Store, DAG);
    1417             : 
    1418        8439 :   EVT MemVT = Store->getMemoryVT();
    1419        8439 :   SDValue Chain = Store->getChain();
    1420        8439 :   SDValue BasePtr = Store->getBasePtr();
    1421             :   SDLoc SL(Op);
    1422             : 
    1423        8439 :   EVT LoVT, HiVT;
    1424        8439 :   EVT LoMemVT, HiMemVT;
    1425             :   SDValue Lo, Hi;
    1426             : 
    1427        8439 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1428        8439 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
    1429        8439 :   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
    1430             : 
    1431        8439 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
    1432             : 
    1433        8439 :   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
                         // Size is the byte distance from the low part to the high part; the high
                         // part's alignment is derived from the base alignment and that offset.
    1434             :   unsigned BaseAlign = Store->getAlignment();
    1435             :   unsigned Size = LoMemVT.getStoreSize();
    1436       16878 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1437             : 
    1438             :   SDValue LoStore =
    1439             :       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
    1440        8439 :                         Store->getMemOperand()->getFlags());
    1441             :   SDValue HiStore =
    1442             :       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
    1443        8439 :                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
    1444             : 
    1445        8439 :   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
    1446             : }
    1447             : 
    1448             : // This is a shortcut for integer division because we have fast i32<->f32
    1449             : // conversions, and fast f32 reciprocal instructions. The fractional part of a
    1450             : // float is enough to accurately represent up to a 24-bit signed integer.
                       //
                       // Op's operands 0 and 1 are the dividend and divisor. Sign selects signed
                       // (SINT_TO_FP / FP_TO_SINT) or unsigned (UINT_TO_FP / FP_TO_UINT)
                       // conversions.
                       //
                       // Returns an empty SDValue when either operand has fewer than 9 known sign
                       // bits; otherwise returns a merge of two values, the quotient and the
                       // remainder.
    1451         169 : SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
    1452             :                                             bool Sign) const {
    1453             :   SDLoc DL(Op);
    1454         169 :   EVT VT = Op.getValueType();
    1455         169 :   SDValue LHS = Op.getOperand(0);
    1456         169 :   SDValue RHS = Op.getOperand(1);
    1457             :   MVT IntVT = MVT::i32;
    1458             :   MVT FltVT = MVT::f32;
    1459             : 
                         // Bail out unless both operands have at least 9 sign bits (presumably so
                         // that a 32-bit value fits in 24 bits -- TODO confirm VT is always i32).
    1460         169 :   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
    1461         169 :   if (LHSSignBits < 9)
    1462         129 :     return SDValue();
    1463             : 
    1464          40 :   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
    1465          40 :   if (RHSSignBits < 9)
    1466           4 :     return SDValue();
    1467             : 
                         // DivBits is the width the results are truncated or sign-extended to at
                         // the end; the signed case keeps one extra bit.
    1468          36 :   unsigned BitSize = VT.getSizeInBits();
    1469          36 :   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    1470          36 :   unsigned DivBits = BitSize - SignBits;
    1471          36 :   if (Sign)
    1472          18 :     ++DivBits;
    1473             : 
    1474          36 :   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
    1475          36 :   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
    1476             : 
                         // jq is the correction added to the truncated quotient: 1 for unsigned,
                         // or the value computed below for signed.
    1477          36 :   SDValue jq = DAG.getConstant(1, DL, IntVT);
    1478             : 
    1479          36 :   if (Sign) {
    1480             :     // char|short jq = ia ^ ib;
    1481          18 :     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
    1482             : 
    1483             :     // jq = jq >> (bitsize - 2)
    1484          18 :     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
    1485          18 :                      DAG.getConstant(BitSize - 2, DL, VT));
    1486             : 
    1487             :     // jq = jq | 0x1
    1488          18 :     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
    1489             :   }
    1490             : 
    1491             :   // int ia = (int)LHS;
    1492          36 :   SDValue ia = LHS;
    1493             : 
    1494             :   // int ib = (int)RHS;
    1495          36 :   SDValue ib = RHS;
    1496             : 
    1497             :   // float fa = (float)ia;
    1498          36 :   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
    1499             : 
    1500             :   // float fb = (float)ib;
    1501          36 :   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
    1502             : 
                         // float fq = fa * rcp(fb);
    1503             :   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
    1504          36 :                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
    1505             : 
    1506             :   // fq = trunc(fq);
    1507          36 :   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
    1508             : 
    1509             :   // float fqneg = -fq;
    1510          36 :   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
    1511             : 
    1512             :   // float fr = mad(fqneg, fb, fa);
                         // AMDGPUISD::FMAD_FTZ is used when the subtarget has f32 denormals enabled,
                         // the generic ISD::FMAD otherwise.
    1513          36 :   unsigned OpCode = Subtarget->hasFP32Denormals() ?
    1514             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1515             :                     (unsigned)ISD::FMAD;
    1516          36 :   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
    1517             : 
    1518             :   // int iq = (int)fq;
    1519          36 :   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
    1520             : 
    1521             :   // fr = fabs(fr);
    1522          36 :   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
    1523             : 
    1524             :   // fb = fabs(fb);
    1525          36 :   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
    1526             : 
    1527          36 :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    1528             : 
    1529             :   // int cv = fr >= fb;
    1530          36 :   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
    1531             : 
    1532             :   // jq = (cv ? jq : 0);
    1533          36 :   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
    1534             : 
    1535             :   // dst = iq + jq;
    1536          36 :   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
    1537             : 
    1538             :   // Rem needs compensation, it's easier to recompute it
                         // Rem = LHS - Div * RHS;
    1539          36 :   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
    1540          36 :   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
    1541             : 
    1542             :   // Truncate to number of bits this divide really is.
    1543          36 :   if (Sign) {
    1544             :     SDValue InRegSize
    1545          18 :       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    1546          18 :     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    1547          18 :     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
    1548             :   } else {
                           // Unsigned: keep only the low DivBits bits.
    1549          18 :     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    1550          18 :     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    1551          18 :     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
    1552             :   }
    1553             : 
    1554          72 :   return DAG.getMergeValues({ Div, Rem }, DL);
    1555             : }
    1556             : 
                       /// Expand the 64-bit unsigned divide/remainder node \p Op.
                       ///
                       /// Appends two i64 values to \p Results: the quotient first, then the
                       /// remainder. One of three strategies is used:
                       ///   1. If the high 32 bits of both operands are known to be zero, a
                       ///      32-bit UDIVREM is emitted and both results are zero-extended.
                       ///   2. If i64 is a legal type, a floating-point reciprocal estimate is
                       ///      refined with integer arithmetic and corrected by up to two
                       ///      conditional subtractions of the divisor.
                       ///   3. Otherwise (the r600 expansion) the high half is divided directly
                       ///      and the low half is produced one bit at a time.
                       ///
                       /// \param Op      The node to expand; operand 0 is the dividend and
                       ///                operand 1 the divisor. Its value type must be i64.
                       /// \param DAG     The SelectionDAG in which the new nodes are created.
                       /// \param Results Output vector that receives {quotient, remainder}.
    1557          72 : void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
    1558             :                                       SelectionDAG &DAG,
    1559             :                                       SmallVectorImpl<SDValue> &Results) const {
    1560             :   SDLoc DL(Op);
    1561          72 :   EVT VT = Op.getValueType();
    1562             : 
    1563             :   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
    1564             : 
    1565          72 :   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1566             : 
                         // Half-width constants; also used as the element index (0 = low half,
                         // 1 = high half) for the EXTRACT_ELEMENT nodes below.
    1567          72 :   SDValue One = DAG.getConstant(1, DL, HalfVT);
    1568          72 :   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
    1569             : 
    1570             :   // HiLo split: break both 64-bit operands into their 32-bit halves.
    1571          72 :   SDValue LHS = Op.getOperand(0);
    1572          72 :   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1573          72 :   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
    1574             : 
    1575          72 :   SDValue RHS = Op.getOperand(1);
    1576          72 :   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1577          72 :   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
    1578             : 
                         // Strategy 1: both operands fit in 32 bits, so a half-width UDIVREM
                         // suffices. Each result is widened by pairing it with a zero high half.
    1579         160 :   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
    1580          88 :       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
    1581             : 
    1582             :     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1583          16 :                               LHS_Lo, RHS_Lo);
    1584             : 
    1585          32 :     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    1586          32 :     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
    1587             : 
    1588          32 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    1589          16 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    1590             :     return;
    1591             :   }
    1592             : 
                         // Strategy 2: reciprocal-based expansion, used when i64 is legal.
    1593          18 :   if (isTypeLegal(MVT::i64)) {
    1594             :     // Compute denominator reciprocal.
                           // The multiply-add opcode depends on the subtarget: FMAD_FTZ when f32
                           // denormals are enabled, the generic FMAD otherwise.
    1595          38 :     unsigned FMAD = Subtarget->hasFP32Denormals() ?
    1596             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1597             :                     (unsigned)ISD::FMAD;
    1598             : 
                           // The constants below are f32 bit patterns:
                           //   0x4f800000 =  2^32
                           //   0x5f7ffffc =  slightly below 2^64
                           //   0x2f800000 =  2^-32
                           //   0xcf800000 = -2^32
                           // Mad1 = float(RHS_Hi) * 2^32 + float(RHS_Lo), i.e. RHS as an f32.
    1599          38 :     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    1600          38 :     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    1601             :     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
    1602          38 :       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
    1603          38 :       Cvt_Lo);
    1604          38 :     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
                           // Mul1 scales the reciprocal by just under 2^64. Trunc is its upper
                           // 32-bit part (Mul1 * 2^-32, truncated) and Mad2 is the lower part
                           // (Mul1 - Trunc * 2^32). Together they form the 64-bit estimate Rcp64.
    1605             :     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
    1606          38 :       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    1607             :     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
    1608          38 :       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    1609          38 :     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    1610             :     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
    1611          38 :       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
    1612          38 :       Mul1);
    1613          38 :     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    1614          38 :     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    1615             :     SDValue Rcp64 = DAG.getBitcast(VT,
    1616          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
    1617             : 
    1618          38 :     SDValue Zero64 = DAG.getConstant(0, DL, VT);
    1619          38 :     SDValue One64  = DAG.getConstant(1, DL, VT);
    1620          38 :     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    1621          38 :     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
    1622             : 
                           // First refinement step:
                           //   Add1 = Rcp64 + mulhu(Rcp64, (0 - RHS) * Rcp64)
                           // The 64-bit add is built from two half-width ADDCARRY nodes.
                           // Add1_HiNc is the high-half sum without the carry; it feeds the
                           // second refinement step.
    1623          38 :     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    1624          38 :     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    1625          38 :     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    1626             :     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1627          38 :                                     Zero);
    1628             :     SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1629          38 :                                     One);
    1630             : 
    1631             :     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
    1632          38 :                                   Mulhi1_Lo, Zero1);
    1633             :     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
    1634          38 :                                   Mulhi1_Hi, Add1_Lo.getValue(1));
    1635          38 :     SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    1636             :     SDValue Add1 = DAG.getBitcast(VT,
    1637          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
    1638             : 
                           // Second refinement step, same shape as the first but starting from
                           // Add1. The high half is accumulated in two ADDCARRY nodes: Add2_HiC
                           // consumes the carry out of Add1_Lo, Add2_Hi the carry out of Add2_Lo.
    1639          38 :     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    1640          38 :     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    1641             :     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1642          38 :                                     Zero);
    1643             :     SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1644          38 :                                     One);
    1645             : 
    1646             :     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
    1647          38 :                                   Mulhi2_Lo, Zero1);
    1648             :     SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
    1649          38 :                                    Mulhi2_Hi, Add1_Lo.getValue(1));
    1650             :     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
    1651          38 :                                   Zero, Add2_Lo.getValue(1));
    1652             :     SDValue Add2 = DAG.getBitcast(VT,
    1653          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
                           // Quotient estimate: Mulhi3 = mulhu(LHS, refined reciprocal).
    1654          38 :     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
    1655             : 
                           // Remainder estimate: Sub1 = LHS - RHS * Mulhi3, computed per half with
                           // SUBCARRY. Sub1_Mi is the high-half difference without the borrow; it
                           // is reused by the first correction step.
    1656          38 :     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
    1657             : 
    1658          38 :     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    1659          38 :     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    1660             :     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
    1661          38 :                                   Mul3_Lo, Zero1);
    1662             :     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
    1663          38 :                                   Mul3_Hi, Sub1_Lo.getValue(1));
    1664          38 :     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    1665             :     SDValue Sub1 = DAG.getBitcast(VT,
    1666          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
    1667             : 
                           // C3 is all-ones when Sub1 >= RHS as an unsigned 64-bit compare done on
                           // halves: compare the low halves if the high halves are equal,
                           // otherwise compare the high halves.
    1668          38 :     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    1669             :     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
    1670          38 :                                  ISD::SETUGE);
    1671             :     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
    1672          38 :                                  ISD::SETUGE);
    1673          38 :     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
    1674             : 
    1675             :     // TODO: Here and below portions of the code can be enclosed into if/endif.
    1676             :     // Currently control flow is unconditional and we have 4 selects after
    1677             :     // potential endif to substitute PHIs.
    1678             : 
    1679             :     // if C3 != 0 ...
                           // First correction: Sub2 = Sub1 - RHS, Add3 = Mulhi3 + 1.
    1680             :     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
    1681          38 :                                   RHS_Lo, Zero1);
    1682             :     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
    1683          38 :                                   RHS_Hi, Sub1_Lo.getValue(1));
    1684             :     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1685          38 :                                   Zero, Sub2_Lo.getValue(1));
    1686             :     SDValue Sub2 = DAG.getBitcast(VT,
    1687          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
    1688             : 
    1689          38 :     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
    1690             : 
                           // C6 is the same half-wise "Sub2 >= RHS" test as C3, applied to Sub2.
    1691             :     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
    1692          38 :                                  ISD::SETUGE);
    1693             :     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
    1694          38 :                                  ISD::SETUGE);
    1695          38 :     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
    1696             : 
    1697             :     // if (C6 != 0)
                           // Second correction: Sub3 = Sub2 - RHS, Add4 = Add3 + 1.
    1698          38 :     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
    1699             : 
    1700             :     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
    1701          38 :                                   RHS_Lo, Zero1);
    1702             :     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1703          38 :                                   RHS_Hi, Sub2_Lo.getValue(1));
    1704             :     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
    1705          38 :                                   Zero, Sub3_Lo.getValue(1));
    1706             :     SDValue Sub3 = DAG.getBitcast(VT,
    1707          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
    1708             : 
    1709             :     // endif C6
    1710             :     // endif C3
    1711             : 
                           // Pick the final values. With no correction (C3 == 0) the results are
                           // Mulhi3 / Sub1; with one correction Add3 / Sub2; with two Add4 / Sub3.
    1712          38 :     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    1713          38 :     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
    1714             : 
    1715          38 :     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    1716          38 :     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
    1717             : 
    1718          38 :     Results.push_back(Div);
    1719          38 :     Results.push_back(Rem);
    1720             : 
    1721             :     return;
    1722             :   }
    1723             : 
    1724             :   // r600 expansion.
    1725             :   // Get Speculative values
                         // The high-half quotient and remainder are computed as if RHS fit in
                         // 32 bits; the selects below discard them when RHS_Hi is non-zero.
    1726          18 :   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    1727          18 :   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
    1728             : 
                         // Initial running remainder: LHS_Hi % RHS_Lo when RHS_Hi == 0, else
                         // LHS_Hi itself, zero-extended to 64 bits.
    1729          18 :   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
    1730          36 :   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
    1731          18 :   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
    1732             : 
                         // High half of the quotient: LHS_Hi / RHS_Lo when RHS_Hi == 0, else 0.
    1733          18 :   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
    1734          18 :   SDValue DIV_Lo = Zero;
    1735             : 
    1736          18 :   const unsigned halfBitWidth = HalfVT.getSizeInBits();
    1737             : 
                         // Produce the low half of the quotient one bit per iteration, walking
                         // LHS_Lo from its most significant bit down to bit 0.
    1738         594 :   for (unsigned i = 0; i < halfBitWidth; ++i) {
    1739         576 :     const unsigned bitPos = halfBitWidth - i - 1;
    1740         576 :     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    1741             :     // Get value of high bit
    1742         576 :     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    1743         576 :     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    1744         576 :     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
    1745             : 
    1746             :     // Shift
    1747         576 :     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    1748             :     // Add LHS high bit
    1749         576 :     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
    1750             : 
                           // Set quotient bit bitPos when the running remainder reaches RHS.
    1751         576 :     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    1752         576 :     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
    1753             : 
    1754         576 :     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
    1755             : 
    1756             :     // Update REM
    1757         576 :     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    1758         576 :     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
    1759             :   }
    1760             : 
                         // Reassemble the 64-bit quotient from its halves and emit both results.
    1761          36 :   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
    1762          18 :   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
    1763          18 :   Results.push_back(DIV);
    1764          18 :   Results.push_back(REM);
    1765             : }
    1766             : 
                       /// Lower an unsigned divide/remainder node.
                       ///
                       /// i64 is forwarded to LowerUDIVREM64. For i32, LowerDIVREM24 is tried
                       /// first and its result returned if it produces one. Every remaining case
                       /// is expanded with AMDGPUISD::URECIP: an approximate reciprocal is
                       /// adjusted by its estimated rounding error, then the quotient and
                       /// remainder are each corrected by at most one step in either direction.
                       ///
                       /// \returns a merge-values node holding {quotient, remainder}.
    1767         171 : SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
    1768             :                                            SelectionDAG &DAG) const {
    1769             :   SDLoc DL(Op);
    1770         171 :   EVT VT = Op.getValueType();
    1771             : 
    1772             :   if (VT == MVT::i64) {
    1773             :     SmallVector<SDValue, 2> Results;
    1774          50 :     LowerUDIVREM64(Op, DAG, Results);
    1775          50 :     return DAG.getMergeValues(Results, DL);
    1776             :   }
    1777             : 
                         // Third argument false = unsigned, mirroring the `true` passed by
                         // LowerSDIVREM. A null result means the 24-bit path did not apply.
    1778             :   if (VT == MVT::i32) {
    1779         121 :     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
    1780          18 :       return Res;
    1781             :   }
    1782             : 
    1783         103 :   SDValue Num = Op.getOperand(0);
    1784         103 :   SDValue Den = Op.getOperand(1);
    1785             : 
    1786             :   // RCP =  URECIP(Den) = 2^32 / Den + e
    1787             :   // e is rounding error.
    1788         103 :   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
    1789             : 
    1790             :   // RCP_LO = mul(RCP, Den)
    1791         103 :   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
    1792             : 
    1793             :   // RCP_HI = mulhu (RCP, Den)
    1794         103 :   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
    1795             : 
    1796             :   // NEG_RCP_LO = -RCP_LO
    1797             :   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
    1798         103 :                                                      RCP_LO);
    1799             : 
    1800             :   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
    1801             :   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1802             :                                            NEG_RCP_LO, RCP_LO,
    1803         103 :                                            ISD::SETEQ);
    1804             :   // Calculate the rounding error from the URECIP instruction
    1805             :   // E = mulhu(ABS_RCP_LO, RCP)
    1806         103 :   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
    1807             : 
    1808             :   // RCP_A_E = RCP + E
    1809         103 :   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
    1810             : 
    1811             :   // RCP_S_E = RCP - E
    1812         103 :   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
    1813             : 
    1814             :   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
    1815             :   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1816             :                                      RCP_A_E, RCP_S_E,
    1817         103 :                                      ISD::SETEQ);
    1818             :   // Quotient = mulhu(Tmp0, Num)
    1819         103 :   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
    1820             : 
    1821             :   // Num_S_Remainder = Quotient * Den
    1822         103 :   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
    1823             : 
    1824             :   // Remainder = Num - Num_S_Remainder
    1825         103 :   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
    1826             : 
    1827             :   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
    1828             :   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
    1829             :                                                  DAG.getConstant(-1, DL, VT),
    1830             :                                                  DAG.getConstant(0, DL, VT),
    1831         103 :                                                  ISD::SETUGE);
    1832             :   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
                         // i.e. the unsigned subtraction that produced Remainder did not wrap.
    1833             :   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
    1834             :                                                   Num_S_Remainder,
    1835             :                                                   DAG.getConstant(-1, DL, VT),
    1836             :                                                   DAG.getConstant(0, DL, VT),
    1837         103 :                                                   ISD::SETUGE);
    1838             :   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
    1839             :   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
    1840         103 :                                                Remainder_GE_Zero);
    1841             : 
    1842             :   // Calculate Division result:
    1843             : 
    1844             :   // Quotient_A_One = Quotient + 1
    1845             :   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
    1846         103 :                                        DAG.getConstant(1, DL, VT));
    1847             : 
    1848             :   // Quotient_S_One = Quotient - 1
    1849             :   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
    1850         103 :                                        DAG.getConstant(1, DL, VT));
    1851             : 
    1852             :   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    1853             :   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1854         103 :                                      Quotient, Quotient_A_One, ISD::SETEQ);
    1855             : 
    1856             :   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    1857         103 :   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1858         103 :                             Quotient_S_One, Div, ISD::SETEQ);
    1859             : 
    1860             :   // Calculate Rem result:
    1861             : 
    1862             :   // Remainder_S_Den = Remainder - Den
    1863         103 :   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
    1864             : 
    1865             :   // Remainder_A_Den = Remainder + Den
    1866         103 :   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
    1867             : 
    1868             :   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    1869             :   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1870         103 :                                     Remainder, Remainder_S_Den, ISD::SETEQ);
    1871             : 
    1872             :   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    1873         103 :   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1874         103 :                             Remainder_A_Den, Rem, ISD::SETEQ);
    1875             :   SDValue Ops[2] = {
    1876             :     Div,
    1877             :     Rem
    1878         103 :   };
    1879         103 :   return DAG.getMergeValues(Ops, DL);
    1880             : }
    1881             : 
                       /// Lower a signed divide/remainder node.
                       ///
                       /// For i32, LowerDIVREM24 is tried first. For i64 operands that both have
                       /// more than 32 sign bits, a 32-bit SDIVREM on the low halves is emitted
                       /// and sign-extended. Otherwise the operands are made non-negative, an
                       /// unsigned UDIVREM is emitted, and the signs are re-applied afterwards.
                       ///
                       /// \returns a merge-values node holding {quotient, remainder}.
    1882          84 : SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
    1883             :                                            SelectionDAG &DAG) const {
    1884             :   SDLoc DL(Op);
    1885          84 :   EVT VT = Op.getValueType();
    1886             : 
    1887          84 :   SDValue LHS = Op.getOperand(0);
    1888          84 :   SDValue RHS = Op.getOperand(1);
    1889             : 
    1890          84 :   SDValue Zero = DAG.getConstant(0, DL, VT);
    1891          84 :   SDValue NegOne = DAG.getConstant(-1, DL, VT);
    1892             : 
                         // Third argument true = signed. A null result means the 24-bit path did
                         // not apply and the generic expansion below is used instead.
    1893             :   if (VT == MVT::i32) {
    1894          48 :     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
    1895          18 :       return Res;
    1896             :   }
    1897             : 
                         // More than 32 sign bits means each operand is the sign-extension of
                         // its low 32 bits, so the division can be done at half width.
    1898          36 :   if (VT == MVT::i64 &&
    1899          48 :       DAG.ComputeNumSignBits(LHS) > 32 &&
    1900          12 :       DAG.ComputeNumSignBits(RHS) > 32) {
    1901          12 :     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1902             : 
    1903             :     // HiLo split: only the low halves are needed here.
    1904          12 :     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1905          12 :     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1906             :     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1907          12 :                                  LHS_Lo, RHS_Lo);
    1908             :     SDValue Res[2] = {
    1909          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
    1910          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    1911             :     };
    1912          12 :     return DAG.getMergeValues(Res, DL);
    1913             :   }
    1914             : 
                         // Sign masks: -1 (all ones) for a negative operand, 0 otherwise. The
                         // quotient is negative when exactly one operand is negative (XOR).
    1915          54 :   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
    1916          54 :   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
    1917          54 :   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
    1918          54 :   SDValue RSign = LHSign; // Remainder sign is the same as LHS
    1919             : 
                         // Absolute value via the mask: (x + sign) ^ sign negates x when sign is
                         // -1 and leaves it unchanged when sign is 0.
    1920          54 :   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
    1921          54 :   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
    1922             : 
    1923          54 :   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
    1924          54 :   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
    1925             : 
    1926          54 :   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
    1927          54 :   SDValue Rem = Div.getValue(1);
    1928             : 
                         // Re-apply the signs with the inverse transform: (x ^ sign) - sign.
    1929          54 :   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
    1930          54 :   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
    1931             : 
    1932          54 :   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
    1933          54 :   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
    1934             : 
    1935             :   SDValue Res[2] = {
    1936             :     Div,
    1937             :     Rem
    1938          54 :   };
    1939          54 :   return DAG.getMergeValues(Res, DL);
    1940             : }
    1941             : 
                       /// Expand a floating-point remainder in terms of FDIV, FTRUNC, FMUL and
                       /// FSUB, all at the value type of \p Op:
    1942             : // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
    1943          36 : SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
    1944             :   SDLoc SL(Op);
    1945          36 :   EVT VT = Op.getValueType();
    1946          36 :   SDValue X = Op.getOperand(0);
    1947          36 :   SDValue Y = Op.getOperand(1);
    1948             : 
    1949             :   // TODO: Should this propagate fast-math-flags?
    1950             : 
                         // NOTE: despite its name, `Floor` holds an FTRUNC (round toward zero)
                         // result, as the formula above requires.
    1951          36 :   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
    1952          36 :   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
    1953          36 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
    1954             : 
    1955          36 :   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
    1956             : }
    1957             : 
                       /// Expand FCEIL as FTRUNC plus a conditional +1.0. Every node is created
                       /// with a hard-coded f64 type, so this handles double precision only.
    1958          31 : SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
    1959             :   SDLoc SL(Op);
    1960          31 :   SDValue Src = Op.getOperand(0);
    1961             : 
    1962             :   // result = trunc(src)
    1963             :   // if (src > 0.0 && src != result)
    1964             :   //   result += 1.0
    1965             : 
    1966          31 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    1967             : 
    1968          31 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    1969          31 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    1970             : 
    1971             :   EVT SetCCVT =
    1972          62 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    1973             : 
                         // NOTE: despite its name, `Lt0` is the ordered test src > 0.0 (SETOGT).
                         // Both compares are ordered, so they are false for NaN inputs.
    1974          31 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
    1975          31 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    1976          31 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    1977             : 
                         // The adjustment is expressed as adding either 1.0 or 0.0 to Trunc.
    1978          31 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
    1979             :   // TODO: Should this propagate fast-math-flags?
    1980          31 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    1981             : }
    1982             : 
                       /// Build i32 nodes that compute the unbiased exponent of an f64.
                       ///
                       /// \param Hi  i32 value that the caller passes as the upper 32 bits of the
                       ///            f64's bit pattern.
                       /// \param SL  Debug location attached to the new nodes.
                       /// \param DAG The SelectionDAG in which the nodes are created.
                       /// \returns the 11-bit exponent field of \p Hi minus 1023 (the IEEE-754
                       ///          double-precision exponent bias).
    1983          91 : static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
    1984             :                                   SelectionDAG &DAG) {
    1985             :   const unsigned FractBits = 52;
    1986             :   const unsigned ExpBits = 11;
    1987             : 
                         // The exponent field starts at bit 52 of the full value, which is bit
                         // FractBits - 32 = 20 of the upper word.
    1988             :   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    1989             :                                 Hi,
    1990             :                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
    1991          91 :                                 DAG.getConstant(ExpBits, SL, MVT::i32));
    1992             :   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
    1993          91 :                             DAG.getConstant(1023, SL, MVT::i32));
    1994             : 
    1995          91 :   return Exp;
    1996             : }
    1997             : 
                       /// Expand an f64 FTRUNC with integer bit manipulation.
                       ///
                       /// The fraction bits that lie below the binary point for the value's
                       /// exponent are cleared. Two exponent ranges are special-cased:
                       ///   - exponent < 0:  the result is just the sign bit (a signed zero);
                       ///   - exponent > 51: the source is returned unchanged.
    1998          75 : SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    1999             :   SDLoc SL(Op);
    2000          75 :   SDValue Src = Op.getOperand(0);
    2001             : 
    2002             :   assert(Op.getValueType() == MVT::f64);
    2003             : 
    2004          75 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2005          75 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2006             : 
    2007          75 :   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2008             : 
    2009             :   // Extract the upper half, since this is where we will find the sign and
    2010             :   // exponent.
    2011          75 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
    2012             : 
    2013          75 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    2014             : 
    2015             :   const unsigned FractBits = 52;
    2016             : 
    2017             :   // Extract the sign bit.
    2018          75 :   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
    2019          75 :   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
    2020             : 
    2021             :   // Extend back to 64-bits.
    2022         150 :   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
    2023          75 :   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
    2024             : 
    2025          75 :   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
    2026             :   const SDValue FractMask
    2027          75 :     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
    2028             : 
                         // Shr = (2^52 - 1) >> Exp selects the fraction bits to drop; Tmp0 is the
                         // source with those bits cleared.
    2029          75 :   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
    2030          75 :   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
    2031          75 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
    2032             : 
    2033             :   EVT SetCCVT =
    2034         150 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2035             : 
    2036          75 :   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
    2037             : 
    2038          75 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2039          75 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2040             : 
                         // Apply the two special cases; the Exp > 51 select is outermost, so it
                         // takes precedence.
    2041          75 :   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
    2042          75 :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
    2043             : 
    2044          75 :   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
    2045             : }
    2046             : 
                       /// Expand an f64 FRINT.
                       ///
                       /// Computes (src + c) - c with c = copysign(2^52, src), and returns src
                       /// unchanged when |src| > 0x1.fffffffffffffp+51.
    2047          14 : SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
    2048             :   SDLoc SL(Op);
    2049          14 :   SDValue Src = Op.getOperand(0);
    2050             : 
    2051             :   assert(Op.getValueType() == MVT::f64);
    2052             : 
                         // C1 = 2^52, given the sign of Src below.
    2053          14 :   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
    2054          14 :   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
    2055          14 :   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
    2056             : 
    2057             :   // TODO: Should this propagate fast-math-flags?
    2058             : 
                         // NOTE(review): presumably the add/sub pair relies on the FADD rounding
                         // away the fraction at magnitude 2^52 - confirm against rounding mode.
    2059          14 :   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
    2060          14 :   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
    2061             : 
    2062          14 :   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
    2063             : 
                         // C2 is the largest double below 2^52; larger magnitudes bypass the
                         // add/sub and are passed through as-is.
    2064          14 :   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
    2065          14 :   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
    2066             : 
    2067             :   EVT SetCCVT =
    2068          28 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2069          14 :   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
    2070             : 
    2071          14 :   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
    2072             : }
    2073             : 
                       /// Lower FNEARBYINT by re-emitting it as an FRINT node with the same value
                       /// type and operand.
    2074          48 : SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
    2075             :   // FNEARBYINT and FRINT are the same, except in their handling of FP
    2076             :   // exceptions. Those aren't really meaningful for us, and OpenCL only has
    2077             :   // rint, so just treat them as equivalent.
    2078          48 :   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
    2079             : }
    2080             : 
    2081             : // XXX - May require not supporting f32 denormals?
    2082             : 
    2083             : // Don't handle v2f16. The extra instructions to scalarize and repack around the
    2084             : // compare and vselect end up producing worse code than scalarizing the whole
    2085             : // operation.
    2086          77 : SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
    2087             :   SDLoc SL(Op);
    2088          77 :   SDValue X = Op.getOperand(0);
    2089          77 :   EVT VT = Op.getValueType();
    2090             : 
    2091          77 :   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
    2092             : 
    2093             :   // TODO: Should this propagate fast-math-flags?
    2094             : 
    2095          77 :   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
    2096             : 
    2097          77 :   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
    2098             : 
    2099          77 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
    2100          77 :   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
    2101          77 :   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
    2102             : 
    2103          77 :   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
    2104             : 
    2105             :   EVT SetCCVT =
    2106          77 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    2107             : 
    2108          77 :   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
    2109             : 
    2110          77 :   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
    2111             : 
    2112          77 :   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
    2113             : }
    2114             : 
    2115          16 : SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
    2116             :   SDLoc SL(Op);
    2117          16 :   SDValue X = Op.getOperand(0);
    2118             : 
    2119          16 :   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
    2120             : 
    2121          16 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2122          16 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2123          16 :   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
    2124          16 :   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
    2125             :   EVT SetCCVT =
    2126          32 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2127             : 
    2128          16 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    2129             : 
    2130          16 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
    2131             : 
    2132          16 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    2133             : 
    2134             :   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
    2135          16 :                                        MVT::i64);
    2136             : 
    2137          16 :   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
    2138             :   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
    2139             :                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
    2140             :                                           MVT::i64),
    2141          16 :                           Exp);
    2142             : 
    2143          16 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
    2144             :   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
    2145             :                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
    2146          16 :                               ISD::SETNE);
    2147             : 
    2148             :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
    2149          16 :                              D, DAG.getConstant(0, SL, MVT::i64));
    2150          16 :   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
    2151             : 
    2152          16 :   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
    2153          16 :   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
    2154             : 
    2155          16 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2156          16 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2157          16 :   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
    2158             : 
    2159             :   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
    2160             :                             ExpEqNegOne,
    2161             :                             DAG.getConstantFP(1.0, SL, MVT::f64),
    2162          16 :                             DAG.getConstantFP(0.0, SL, MVT::f64));
    2163             : 
    2164          16 :   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
    2165             : 
    2166          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
    2167          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
    2168             : 
    2169          16 :   return K;
    2170             : }
    2171             : 
    2172          93 : SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
    2173          93 :   EVT VT = Op.getValueType();
    2174             : 
    2175             :   if (VT == MVT::f32 || VT == MVT::f16)
    2176          77 :     return LowerFROUND32_16(Op, DAG);
    2177             : 
    2178             :   if (VT == MVT::f64)
    2179          16 :     return LowerFROUND64(Op, DAG);
    2180             : 
    2181           0 :   llvm_unreachable("unhandled type");
    2182             : }
    2183             : 
    2184           0 : SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
    2185             :   SDLoc SL(Op);
    2186           0 :   SDValue Src = Op.getOperand(0);
    2187             : 
    2188             :   // result = trunc(src);
    2189             :   // if (src < 0.0 && src != result)
    2190             :   //   result += -1.0.
    2191             : 
    2192           0 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2193             : 
    2194           0 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    2195           0 :   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
    2196             : 
    2197             :   EVT SetCCVT =
    2198           0 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2199             : 
    2200           0 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
    2201           0 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    2202           0 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    2203             : 
    2204           0 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
    2205             :   // TODO: Should this propagate fast-math-flags?
    2206           0 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    2207             : }
    2208             : 
    2209          74 : SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
    2210             :                                         double Log2BaseInverted) const {
    2211          74 :   EVT VT = Op.getValueType();
    2212             : 
    2213             :   SDLoc SL(Op);
    2214          74 :   SDValue Operand = Op.getOperand(0);
    2215          74 :   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
    2216          74 :   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
    2217             : 
    2218          74 :   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
    2219             : }
    2220             : 
    2221             : // Return M_LOG2E of appropriate type
    2222          50 : static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
    2223          50 :   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
    2224          37 :   case MVT::f32:
    2225          37 :     return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
    2226             :   case MVT::f16:
    2227             :     return DAG.getConstantFP(
    2228          26 :       APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
    2229          13 :       SL, VT);
    2230             :   case MVT::f64:
    2231             :     return DAG.getConstantFP(
    2232           0 :       APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
    2233           0 :   default:
    2234           0 :     llvm_unreachable("unsupported fp type");
    2235             :   }
    2236             : }
    2237             : 
    2238             : // exp2(M_LOG2E_F * f);
    2239          50 : SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
    2240          50 :   EVT VT = Op.getValueType();
    2241             :   SDLoc SL(Op);
    2242          50 :   SDValue Src = Op.getOperand(0);
    2243             : 
    2244          50 :   const SDValue K = getLog2EVal(DAG, SL, VT);
    2245          50 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
    2246          50 :   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
    2247             : }
    2248             : 
    2249             : static bool isCtlzOpc(unsigned Opc) {
    2250        3740 :   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
    2251             : }
    2252             : 
    2253             : static bool isCttzOpc(unsigned Opc) {
    2254        4585 :   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
    2255             : }
    2256             : 
    2257         415 : SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
    2258             :   SDLoc SL(Op);
    2259         415 :   SDValue Src = Op.getOperand(0);
    2260         415 :   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
    2261             :                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
    2262             : 
    2263             :   unsigned ISDOpc, NewOpc;
    2264         415 :   if (isCtlzOpc(Op.getOpcode())) {
    2265             :     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
    2266             :     NewOpc = AMDGPUISD::FFBH_U32;
    2267          73 :   } else if (isCttzOpc(Op.getOpcode())) {
    2268             :     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
    2269             :     NewOpc = AMDGPUISD::FFBL_B32;
    2270             :   } else
    2271           0 :     llvm_unreachable("Unexpected OPCode!!!");
    2272             : 
    2273             : 
    2274         415 :   if (ZeroUndef && Src.getValueType() == MVT::i32)
    2275         329 :     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
    2276             : 
    2277          86 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2278             : 
    2279          86 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2280          86 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2281             : 
    2282          86 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    2283          86 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    2284             : 
    2285             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2286         172 :                                    *DAG.getContext(), MVT::i32);
    2287             : 
    2288          86 :   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
    2289          86 :   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
    2290             : 
    2291          86 :   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
    2292          86 :   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
    2293             : 
    2294          86 :   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
    2295          86 :   SDValue Add, NewOpr;
    2296          86 :   if (isCtlzOpc(Op.getOpcode())) {
    2297          82 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
    2298             :     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
    2299          82 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
    2300             :   } else {
    2301           4 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
    2302             :     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
    2303           4 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
    2304             :   }
    2305             : 
    2306          86 :   if (!ZeroUndef) {
    2307             :     // Test if the full 64-bit input is zero.
    2308             : 
    2309             :     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    2310             :     // which we probably don't want.
    2311           8 :     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
    2312           8 :     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
    2313           8 :     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
    2314             : 
    2315             :     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    2316             :     // with the same cycles, otherwise it is slower.
    2317             :     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    2318             :     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
    2319             : 
    2320           8 :     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
    2321             : 
    2322             :     // The instruction returns -1 for 0 input, but the defined intrinsic
    2323             :     // behavior is to return the number of bits.
    2324           8 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
    2325           8 :                          SrcIsZero, Bits32, NewOpr);
    2326             :   }
    2327             : 
    2328          86 :   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
    2329             : }
    2330             : 
    2331          67 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
    2332             :                                                bool Signed) const {
    2333             :   // Unsigned
    2334             :   // cul2f(ulong u)
    2335             :   //{
    2336             :   //  uint lz = clz(u);
    2337             :   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
    2338             :   //  u = (u << lz) & 0x7fffffffffffffffUL;
    2339             :   //  ulong t = u & 0xffffffffffUL;
    2340             :   //  uint v = (e << 23) | (uint)(u >> 40);
    2341             :   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
    2342             :   //  return as_float(v + r);
    2343             :   //}
    2344             :   // Signed
    2345             :   // cl2f(long l)
    2346             :   //{
    2347             :   //  long s = l >> 63;
    2348             :   //  float r = cul2f((l + s) ^ s);
    2349             :   //  return s ? -r : r;
    2350             :   //}
    2351             : 
    2352             :   SDLoc SL(Op);
    2353          67 :   SDValue Src = Op.getOperand(0);
    2354          67 :   SDValue L = Src;
    2355             : 
    2356          67 :   SDValue S;
    2357          67 :   if (Signed) {
    2358          32 :     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
    2359          32 :     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
    2360             : 
    2361          32 :     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
    2362          32 :     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
    2363             :   }
    2364             : 
    2365             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2366         134 :                                    *DAG.getContext(), MVT::f32);
    2367             : 
    2368             : 
    2369          67 :   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
    2370          67 :   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
    2371          67 :   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
    2372          67 :   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
    2373             : 
    2374          67 :   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
    2375             :   SDValue E = DAG.getSelect(SL, MVT::i32,
    2376             :     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
    2377             :     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
    2378          67 :     ZeroI32);
    2379             : 
    2380             :   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
    2381             :     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
    2382          67 :     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
    2383             : 
    2384             :   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
    2385          67 :                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
    2386             : 
    2387             :   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
    2388          67 :                              U, DAG.getConstant(40, SL, MVT::i64));
    2389             : 
    2390             :   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
    2391             :     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
    2392          67 :     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
    2393             : 
    2394          67 :   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
    2395          67 :   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
    2396          67 :   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
    2397             : 
    2398          67 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2399             : 
    2400          67 :   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
    2401             : 
    2402             :   SDValue R = DAG.getSelect(SL, MVT::i32,
    2403             :     RCmp,
    2404             :     One,
    2405          67 :     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
    2406          67 :   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
    2407          67 :   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
    2408             : 
    2409          67 :   if (!Signed)
    2410          35 :     return R;
    2411             : 
    2412          32 :   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
    2413          32 :   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
    2414             : }
    2415             : 
    2416          10 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
    2417             :                                                bool Signed) const {
    2418             :   SDLoc SL(Op);
    2419          10 :   SDValue Src = Op.getOperand(0);
    2420             : 
    2421          10 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2422             : 
    2423             :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2424          10 :                            DAG.getConstant(0, SL, MVT::i32));
    2425             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2426          10 :                            DAG.getConstant(1, SL, MVT::i32));
    2427             : 
    2428             :   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
    2429          18 :                               SL, MVT::f64, Hi);
    2430             : 
    2431          10 :   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
    2432             : 
    2433             :   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
    2434          10 :                               DAG.getConstant(32, SL, MVT::i32));
    2435             :   // TODO: Should this propagate fast-math-flags?
    2436          10 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
    2437             : }
    2438             : 
    2439          47 : SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
    2440             :                                                SelectionDAG &DAG) const {
    2441             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2442             :          "operation should be legal");
    2443             : 
    2444             :   // TODO: Factor out code common with LowerSINT_TO_FP.
    2445             : 
    2446             :   EVT DestVT = Op.getValueType();
    2447          47 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2448             :     SDLoc DL(Op);
    2449           4 :     SDValue Src = Op.getOperand(0);
    2450             : 
    2451           4 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    2452           8 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2453             :     SDValue FPRound =
    2454           4 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2455             : 
    2456           4 :     return FPRound;
    2457             :   }
    2458             : 
    2459             :   if (DestVT == MVT::f32)
    2460          35 :     return LowerINT_TO_FP32(Op, DAG, false);
    2461             : 
    2462             :   assert(DestVT == MVT::f64);
    2463           8 :   return LowerINT_TO_FP64(Op, DAG, false);
    2464             : }
    2465             : 
    2466          38 : SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
    2467             :                                               SelectionDAG &DAG) const {
    2468             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2469             :          "operation should be legal");
    2470             : 
    2471             :   // TODO: Factor out code common with LowerUINT_TO_FP.
    2472             : 
    2473             :   EVT DestVT = Op.getValueType();
    2474          38 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2475             :     SDLoc DL(Op);
    2476           4 :     SDValue Src = Op.getOperand(0);
    2477             : 
    2478           4 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    2479           8 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2480             :     SDValue FPRound =
    2481           4 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2482             : 
    2483           4 :     return FPRound;
    2484             :   }
    2485             : 
    2486             :   if (DestVT == MVT::f32)
    2487          32 :     return LowerINT_TO_FP32(Op, DAG, true);
    2488             : 
    2489             :   assert(DestVT == MVT::f64);
    2490           2 :   return LowerINT_TO_FP64(Op, DAG, true);
    2491             : }
    2492             : 
    2493          16 : SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
    2494             :                                                bool Signed) const {
    2495             :   SDLoc SL(Op);
    2496             : 
    2497          16 :   SDValue Src = Op.getOperand(0);
    2498             : 
    2499          16 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2500             : 
    2501             :   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
    2502          16 :                                  MVT::f64);
    2503             :   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
    2504          16 :                                  MVT::f64);
    2505             :   // TODO: Should this propagate fast-math-flags?
    2506          16 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
    2507             : 
    2508          16 :   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
    2509             : 
    2510             : 
    2511          16 :   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
    2512             : 
    2513             :   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
    2514          30 :                            MVT::i32, FloorMul);
    2515          16 :   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
    2516             : 
    2517          32 :   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
    2518             : 
    2519          16 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
    2520             : }
    2521             : 
    2522         983 : SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
    2523             :   SDLoc DL(Op);
    2524         983 :   SDValue N0 = Op.getOperand(0);
    2525             : 
    2526             :   // Convert to target node to get known bits
    2527             :   if (N0.getValueType() == MVT::f32)
    2528         934 :     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
    2529             : 
    2530          49 :   if (getTargetMachine().Options.UnsafeFPMath) {
    2531             :     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    2532          10 :     return SDValue();
    2533             :   }
    2534             : 
    2535             :   assert(N0.getSimpleValueType() == MVT::f64);
    2536             : 
    2537             :   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
    2538             :   const unsigned ExpMask = 0x7ff;
    2539             :   const unsigned ExpBiasf64 = 1023;
    2540             :   const unsigned ExpBiasf16 = 15;
    2541          39 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    2542          39 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    2543          39 :   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
    2544             :   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
    2545          39 :                            DAG.getConstant(32, DL, MVT::i64));
    2546          39 :   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
    2547          39 :   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
    2548             :   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2549          39 :                           DAG.getConstant(20, DL, MVT::i64));
    2550          39 :   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
    2551          39 :                   DAG.getConstant(ExpMask, DL, MVT::i32));
    2552             :   // Subtract the fp64 exponent bias (1023) to get the real exponent and
    2553             :   // add the f16 bias (15) to get the biased exponent for the f16 format.
    2554          39 :   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
    2555          39 :                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
    2556             : 
    2557             :   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2558          39 :                           DAG.getConstant(8, DL, MVT::i32));
    2559          39 :   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
    2560          39 :                   DAG.getConstant(0xffe, DL, MVT::i32));
    2561             : 
    2562             :   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
    2563          39 :                                   DAG.getConstant(0x1ff, DL, MVT::i32));
    2564          39 :   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
    2565             : 
    2566          39 :   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
    2567          39 :   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
    2568             : 
    2569             :   // (M != 0 ? 0x0200 : 0) | 0x7c00;
    2570             :   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
    2571             :       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
    2572          39 :                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
    2573             : 
    2574             :   // N = M | (E << 12);
    2575             :   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2576             :       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
    2577          39 :                   DAG.getConstant(12, DL, MVT::i32)));
    2578             : 
    2579             :   // B = clamp(1-E, 0, 13);
    2580             :   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
    2581          39 :                                   One, E);
    2582          39 :   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
    2583          39 :   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
    2584          39 :                   DAG.getConstant(13, DL, MVT::i32));
    2585             : 
    2586             :   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2587          39 :                                    DAG.getConstant(0x1000, DL, MVT::i32));
    2588             : 
    2589          39 :   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
    2590          39 :   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
    2591          39 :   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
    2592          39 :   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
    2593             : 
    2594          39 :   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
    2595             :   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
    2596          39 :                               DAG.getConstant(0x7, DL, MVT::i32));
    2597          39 :   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
    2598          39 :                   DAG.getConstant(2, DL, MVT::i32));
    2599             :   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
    2600          39 :                                One, Zero, ISD::SETEQ);
    2601             :   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
    2602          39 :                                One, Zero, ISD::SETGT);
    2603          39 :   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
    2604          39 :   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
    2605             : 
    2606          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
    2607          39 :                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
    2608          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
    2609          39 :                       I, V, ISD::SETEQ);
    2610             : 
    2611             :   // Extract the sign bit.
    2612             :   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2613          39 :                             DAG.getConstant(16, DL, MVT::i32));
    2614          39 :   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
    2615          39 :                      DAG.getConstant(0x8000, DL, MVT::i32));
    2616             : 
    2617          39 :   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
    2618          39 :   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
    2619             : }
    2620             : 
    //               : /// Custom lowering for a floating-point to signed integer conversion.
    //               : ///
    //               : /// - f16 source on a subtarget with 16-bit instructions: the source is
    //               : ///   extended to f32 and the same opcode is re-emitted with an i64 result.
    //               : /// - i64 result from an f64 source: delegated to LowerFP64_TO_INT with
    //               : ///   `true` as the third argument (presumably the "signed" flag).
    //               : /// - Anything else: an empty SDValue is returned.
    2621          65 : SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
    2622             :                                               SelectionDAG &DAG) const {
    2623          65 :   SDValue Src = Op.getOperand(0);
    2624             : 
    2625             :   // TODO: Factor out code common with LowerFP_TO_UINT.
    2626             : 
    2627             :   EVT SrcVT = Src.getValueType();
    2628          65 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2629             :     SDLoc DL(Op);
    2630             : 
    2631           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    //               :     // NOTE(review): despite its name, FpToInt32 is created with an i64
    //               :     // result type, independent of Op.getValueType() -- confirm this path
    //               :     // is only reached for i64 results.
    2632             :     SDValue FpToInt32 =
    2633           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2634             : 
    2635           3 :     return FpToInt32;
    2636             :   }
    2637             : 
    2638             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2639           2 :     return LowerFP64_TO_INT(Op, DAG, true);
    2640             : 
    2641          60 :   return SDValue();
    2642             : }
    2643             : 
    //               : /// Custom lowering for a floating-point to unsigned integer conversion.
    //               : ///
    //               : /// - f16 source on a subtarget with 16-bit instructions: the source is
    //               : ///   extended to f32 and the same opcode is re-emitted with an i64 result.
    //               : /// - i64 result from an f64 source: delegated to LowerFP64_TO_INT with
    //               : ///   `false` as the third argument (presumably the "signed" flag).
    //               : /// - Anything else: an empty SDValue is returned.
    2644          37 : SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
    2645             :                                               SelectionDAG &DAG) const {
    2646          37 :   SDValue Src = Op.getOperand(0);
    2647             : 
    2648             :   // TODO: Factor out code common with LowerFP_TO_SINT.
    2649             : 
    2650             :   EVT SrcVT = Src.getValueType();
    2651          37 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2652             :     SDLoc DL(Op);
    2653             : 
    2654           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    //               :     // NOTE(review): despite its name, FpToInt32 is created with an i64
    //               :     // result type, independent of Op.getValueType() -- confirm this path
    //               :     // is only reached for i64 results.
    2655             :     SDValue FpToInt32 =
    2656           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2657             : 
    2658           3 :     return FpToInt32;
    2659             :   }
    2660             : 
    2661             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2662          14 :     return LowerFP64_TO_INT(Op, DAG, false);
    2663             : 
    2664          20 :   return SDValue();
    2665             : }
    2666             : 
    //               : /// Scalarizes a vector SIGN_EXTEND_INREG.
    //               : ///
    //               : /// Every element of operand 0 is extracted, sign-extended in-register from
    //               : /// the scalar type of the VT operand (operand 1), and the results are
    //               : /// reassembled with a build_vector of the original type. The result type
    //               : /// must be a vector (asserted).
    2667          16 : SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    2668             :                                                      SelectionDAG &DAG) const {
    2669          16 :   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    2670          16 :   MVT VT = Op.getSimpleValueType();
    2671             :   MVT ScalarVT = VT.getScalarType();
    2672             : 
    2673             :   assert(VT.isVector());
    2674             : 
    2675          16 :   SDValue Src = Op.getOperand(0);
    2676             :   SDLoc DL(Op);
    2677             : 
    2678             :   // TODO: Don't scalarize on Evergreen?
    2679             :   unsigned NElts = VT.getVectorNumElements();
    2680             :   SmallVector<SDValue, 8> Args;
    2681          16 :   DAG.ExtractVectorElements(Src, Args, 0, NElts);
    2682             : 
    //               :   // The per-element operation extends from the element type of ExtraVT.
    2683          16 :   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
    2684          66 :   for (unsigned I = 0; I < NElts; ++I)
    2685         100 :     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
    2686             : 
    2687          16 :   return DAG.getBuildVector(VT, DL, Args);
    2688             : }
    2689             : 
    2690             : //===----------------------------------------------------------------------===//
    2691             : // Custom DAG optimizations
    2692             : //===----------------------------------------------------------------------===//
    2693             : 
    //               : /// Returns true when AMDGPUTargetLowering::numBitsUnsigned reports that
    //               : /// \p Op needs at most 24 bits.
    2694             : static bool isU24(SDValue Op, SelectionDAG &DAG) {
    2695        8446 :   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
    2696             : }
    2697             : 
    //               : /// Returns true when \p Op has a type of at least 24 bits and
    //               : /// AMDGPUTargetLowering::numBitsSigned reports fewer than 24 bits for it.
    //               : /// Narrower types are deliberately rejected here so that they are handled
    //               : /// through the unsigned 24-bit path instead.
    2698        6146 : static bool isI24(SDValue Op, SelectionDAG &DAG) {
    2699        6146 :   EVT VT = Op.getValueType();
    2700       12292 :   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
    2701             :                                      // as unsigned 24-bit values.
    2702        6146 :     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
    2703             : }
    2704             : 
    //               : /// Asks the generic SimplifyDemandedBits machinery to simplify operand
    //               : /// \p OpIdx of \p Node24, given that only its low 24 bits are demanded.
    //               : ///
    //               : /// \returns true if SimplifyDemandedBits reported a change, false otherwise.
    2705       12412 : static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
    2706             :                         TargetLowering::DAGCombinerInfo &DCI) {
    2707             : 
    2708       12412 :   SelectionDAG &DAG = DCI.DAG;
    2709       12412 :   SDValue Op = Node24->getOperand(OpIdx);
    2710       12412 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    2711       12412 :   EVT VT = Op.getValueType();
    2712             : 
    //               :   // Mask covering the low 24 bits at the operand's full bit width.
    2713       12412 :   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
    //               :   // NOTE(review): KnownZero and KnownOne are never read in this function.
    2714             :   APInt KnownZero, KnownOne;
    2715             :   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
    2716       12412 :   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
    2717         826 :     return true;
    2718             : 
    2719             :   return false;
    2720             : }
    2721             : 
    //               : /// Constant-folds a 32-bit bit-field extract of \p Width bits starting at
    //               : /// bit \p Offset of \p Src0 and returns the result as an i32 constant.
    //               : ///
    //               : /// \tparam IntTy integer type used for the final right shift; presumably a
    //               : ///         signed type yields a sign-extended field and an unsigned type a
    //               : ///         zero-extended one -- confirm against the callers.
    2722             : template <typename IntTy>
    2723          48 : static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
    2724             :                                uint32_t Width, const SDLoc &DL) {
    2725          48 :   if (Width + Offset < 32) {
    //               :     // Move the field to the top of the word, then shift it back down so
    //               :     // that the extension is performed by the IntTy right shift.
    //               :     // NOTE(review): Width == 0 would make the second shift amount 32;
    //               :     // presumably callers filter that case out -- confirm.
    2726          20 :     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    2727          20 :     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    2728          20 :     return DAG.getConstant(Result, DL, MVT::i32);
    2729             :   }
    2730             : 
    //               :   // The field reaches (or passes) bit 31: a plain right shift suffices.
    2731          28 :   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
    2732             : }
    2733          24 : 
    //               : /// Returns true if any user of \p Val is a memory node (MemSDNode) that is
    //               : /// marked volatile.
    2734             : static bool hasVolatileUser(SDNode *Val) {
    2735          24 :   for (SDNode *U : Val->uses()) {
    2736          10 :     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
    2737          10 :       if (M->isVolatile())
    2738          10 :         return true;
    2739             :     }
    2740             :   }
    2741          14 : 
    2742             :   return false;
    2743          24 : }
    2744             : 
    //               : /// Decides whether a load/store of memory type \p VT should be rewritten
    //               : /// to use an equivalent type (see performLoadCombine/performStoreCombine).
    //               : ///
    //               : /// Returns false for types whose scalar type is i32, for legal types, for
    //               : /// types that are not byte sized, for non-vector types of 1, 2 or 4 bytes,
    //               : /// and for store sizes of 3 bytes or of more than 4 bytes that are not a
    //               : /// multiple of 4. Returns true otherwise.
    2745          24 : bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
    2746          10 :   // i32 vectors are the canonical memory type.
    2747          10 :   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    2748          10 :     return false;
    2749             : 
    2750             :   if (!VT.isByteSized())
    2751          14 :     return false;
    2752             : 
    2753             :   unsigned Size = VT.getStoreSize();
    2754      183847 : 
    2755      503656 :   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    2756       64612 :     return false;
    2757       64612 : 
    2758             :   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    2759             :     return false;
    2760             : 
    2761             :   return true;
    2762             : }
    2763             : 
    2764             : // Replace load of an illegal type with a load of a bitcast to a friendlier
    2765      260930 : // type.
    //               : //
    //               : // Only runs before legalization and only for normal, non-volatile loads
    //               : // without volatile users. Under-aligned loads of legal types are expanded
    //               : // or scalarized first when the target does not allow the misaligned
    //               : // access. Returns an empty SDValue when nothing is changed.
    2766             : SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
    2767      422895 :                                                  DAGCombinerInfo &DCI) const {
    2768      244429 :   if (!DCI.isBeforeLegalize())
    2769             :     return SDValue();
    2770       16501 : 
    2771             :   LoadSDNode *LN = cast<LoadSDNode>(N);
    2772             :   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    2773             :     return SDValue();
    2774             : 
    2775       23972 :   SDLoc SL(N);
    2776             :   SelectionDAG &DAG = DCI.DAG;
    2777             :   EVT VT = LN->getMemoryVT();
    2778        8181 : 
    2779        1168 :   unsigned Size = VT.getStoreSize();
    2780             :   unsigned Align = LN->getAlignment();
    2781             :   if (Align < Size && isTypeLegal(VT)) {
    2782             :     bool IsFast;
    2783             :     unsigned AS = LN->getAddressSpace();
    2784             : 
    2785             :     // Expand unaligned loads earlier than legalization. Due to visitation order
    2786      234052 :     // problems during legalization, the emitted instructions to pack and unpack
    2787             :     // the bytes again are not eliminated in the case of an unaligned copy.
    2788      234052 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2789       29961 :       if (VT.isVector())
    2790             :         return scalarizeVectorLoad(LN, DAG);
    2791             : 
    2792      387938 :       SDValue Ops[2];
    2793       26653 :       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
    2794             :       return DAG.getMergeValues(Ops, SDLoc(N));
    2795             :     }
    2796      177438 : 
    //               :     // The misaligned access is allowed but not reported as fast: leave the
    //               :     // load untouched.
    2797      177438 :     if (!IsFast)
    2798             :       return SDValue();
    2799             :   }
    2800      177438 : 
    2801      177438 :   if (!shouldCombineMemoryType(VT))
    2802             :     return SDValue();
    2803             : 
    2804             :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2805             : 
    2806             :   SDValue NewLoad
    2807             :     = DAG.getLoad(NewVT, SL, LN->getChain(),
    2808       53971 :                   LN->getBasePtr(), LN->getMemOperand());
    2809         329 : 
    //               :   // Replace both results of the old load: the value (bitcast back to the
    //               :   // original type) and the chain of the new load.
    2810          48 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
    2811             :   DCI.CombineTo(N, BC, NewLoad.getValue(1));
    2812         281 :   return SDValue(N, 0);
    2813         281 : }
    2814         281 : 
    2815             : // Replace store of an illegal type with a store of a bitcast to a friendlier
    2816             : // type.
    //               : //
    //               : // Only runs before legalization and only for normal, non-volatile stores.
    //               : // Under-aligned stores of legal types are expanded or scalarized first when
    //               : // the target does not allow the misaligned access. Returns an empty
    //               : // SDValue when nothing is changed.
    2817       53642 : SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
    2818          23 :                                                   DAGCombinerInfo &DCI) const {
    2819             :   if (!DCI.isBeforeLegalize())
    2820             :     return SDValue();
    2821      177086 : 
    2822      172255 :   StoreSDNode *SN = cast<StoreSDNode>(N);
    2823             :   if (SN->isVolatile() || !ISD::isNormalStore(SN))
    2824        4831 :     return SDValue();
    2825             : 
    2826             :   EVT VT = SN->getMemoryVT();
    2827             :   unsigned Size = VT.getStoreSize();
    2828        9662 : 
    2829             :   SDLoc SL(N);
    2830        4831 :   SelectionDAG &DAG = DCI.DAG;
    2831        4831 :   unsigned Align = SN->getAlignment();
    2832        4831 :   if (Align < Size && isTypeLegal(VT)) {
    2833             :     bool IsFast;
    2834             :     unsigned AS = SN->getAddressSpace();
    2835             : 
    2836             :     // Expand unaligned stores earlier than legalization. Due to visitation
    2837      134066 :     // order problems during legalization, the emitted instructions to pack and
    2838             :     // unpack the bytes again are not eliminated in the case of an unaligned
    2839      134066 :     // copy.
    2840       34602 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2841             :       if (VT.isVector())
    2842             :         return scalarizeVectorStore(SN, DAG);
    2843       99464 : 
    2844       15337 :       return expandUnalignedStore(SN, DAG);
    2845             :     }
    2846       84127 : 
    //               :     // The misaligned access is allowed but not reported as fast: leave the
    //               :     // store untouched.
    2847             :     if (!IsFast)
    2848             :       return SDValue();
    2849             :   }
    2850       84127 : 
    2851       84127 :   if (!shouldCombineMemoryType(VT))
    2852       84127 :     return SDValue();
    2853             : 
    2854             :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2855             :   SDValue Val = SN->getValue();
    2856             : 
    2857             :   //DCI.AddToWorklist(Val.getNode());
    2858             : 
    //               :   // If the stored value has users besides this store, redirect them to a
    //               :   // bitcast of the new value back to the original type.
    2859             :   bool OtherUses = !Val.hasOneUse();
    2860       11637 :   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
    2861         283 :   if (OtherUses) {
    2862         283 :     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    2863             :     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
    2864         232 :   }
    2865             : 
    2866             :   return DAG.getStore(SN->getChain(), SL, CastVal,
    2867       11354 :                       SN->getBasePtr(), SN->getMemOperand());
    2868           0 : }
    2869             : 
    2870             : // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
    2871       83844 : // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
    2872       81662 : // issues.
    //               : //
    //               : // Hoists an assert[sz]ext above a truncate: the assertion node (same opcode
    //               : // as N) is re-created on the wider, untruncated source and the truncate is
    //               : // applied afterwards. Returns an empty SDValue if the pattern does not match.
    2873             : SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
    2874        2182 :                                                         DAGCombinerInfo &DCI) const {
    2875        2182 :   SelectionDAG &DAG = DCI.DAG;
    2876             :   SDValue N0 = N->getOperand(0);
    2877             : 
    2878             :   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
    2879             :   //     (vt2 (truncate (assertzext vt0:x, vt1)))
    2880        2182 :   if (N0.getOpcode() == ISD::TRUNCATE) {
    2881        2182 :     SDValue N1 = N->getOperand(1);
    2882          33 :     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    2883          33 :     SDLoc SL(N);
    2884             : 
    2885             :     SDValue Src = N0.getOperand(0);
    2886             :     EVT SrcVT = Src.getValueType();
    //               :     // Only valid when the untruncated source is at least as wide as the
    //               :     // asserted type.
    2887        4364 :     if (SrcVT.bitsGE(ExtVT)) {
    2888             :       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
    2889             :       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    2890             :     }
    2891             :   }
    2892             : 
    2893       11426 :   return SDValue();
    2894             : }
    2895       11426 : /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
    2896       11426 : /// binary operation \p Opc to it with the corresponding constant operands.
    //               : ///
    //               : /// \p ValLo is applied to the low half and \p ValHi to the high half. The
    //               : /// two i32 results are packed into a v2i32 build_vector that is bitcast to
    //               : /// i64 and returned.
    2897             : SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    2898             :   DAGCombinerInfo &DCI, const SDLoc &SL,
    2899             :   unsigned Opc, SDValue LHS,
    2900       11426 :   uint32_t ValLo, uint32_t ValHi) const {
    2901           0 :   SelectionDAG &DAG = DCI.DAG;
    2902           0 :   SDValue Lo, Hi;
    2903             :   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
    2904             : 
    2905           0 :   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
    2906           0 :   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
    2907           0 : 
    //               :   // Despite the "And" suffix, these nodes use the caller-supplied Opc.
    2908           0 :   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
    2909           0 :   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
    2910             : 
    2911             :   // Re-visit the ands. It's possible we eliminated one of them and it could
    2912             :   // simplify the vector.
    //               :   // NOTE(review): the nodes queued below are the split halves (Lo/Hi),
    //               :   // not LoAnd/HiAnd as the comment above suggests -- confirm the intent.
    2913       11426 :   DCI.AddToWorklist(Lo.getNode());
    2914             :   DCI.AddToWorklist(Hi.getNode());
    2915             : 
    2916             :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
    2917        1630 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2918             : }
    2919             : 
    //               : /// DAG combine for SHL with a constant shift amount.
    //               : ///
    //               : /// Handles, in order:
    //               : ///  - a shift by zero, which folds to the shifted operand;
    //               : ///  - i32 (shl (ext i16:x), 16) when v2i16 build_vector is legal;
    //               : ///  - i64 (shl (ext x), C) when x has at least C known leading zero bits;
    //               : ///  - i64 (shl x, C) for C >= 32, split into a 32-bit shift and a zero.
    //               : /// Returns an empty SDValue when none of these apply.
    2920             : SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
    2921        1630 :                                                 DAGCombinerInfo &DCI) const {
    2922             :   EVT VT = N->getValueType(0);
    2923        1630 : 
    2924             :   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2925        1630 :   if (!RHS)
    2926        1630 :     return SDValue();
    2927             : 
    2928        1630 :   SDValue LHS = N->getOperand(0);
    2929        1630 :   unsigned RHSVal = RHS->getZExtValue();
    2930             :   if (!RHSVal)
    2931             :     return LHS;
    2932             : 
    2933        1630 :   SDLoc SL(N);
    2934        1630 :   SelectionDAG &DAG = DCI.DAG;
    2935             : 
    2936        3260 :   switch (LHS->getOpcode()) {
    2937        1630 :   default:
    2938             :     break;
    2939             :   case ISD::ZERO_EXTEND:
    2940       19523 :   case ISD::SIGN_EXTEND:
    2941             :   case ISD::ANY_EXTEND: {
    2942       19523 :     SDValue X = LHS->getOperand(0);
    2943             : 
    2944       19523 :     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
    2945             :         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
    2946        3829 :       // Prefer build_vector as the canonical form if packed types are legal.
    2947             :       // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
    2948       15694 :       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
    2949       15694 :        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
    2950       15694 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    2951           0 :     }
    2952             : 
    2953             :     // shl (ext x) => zext (shl x), if shift does not overflow int
    2954       15694 :     if (VT != MVT::i64)
    2955             :       break;
    2956       31388 :     KnownBits Known;
    2957             :     DAG.computeKnownBits(X, Known);
    //               :     // RHSVal is non-zero here, so LZ >= RHSVal also means the top bit of X
    //               :     // is known zero; the kind of extension then makes no difference.
    2958             :     unsigned LZ = Known.countMinLeadingZeros();
    2959        4972 :     if (LZ < RHSVal)
    2960             :       break;
    2961             :     EVT XVT = X.getValueType();
    2962        4972 :     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    2963             :     return DAG.getZExtOrTrunc(Shl, SL, VT);
    2964         617 :   }
    2965             :   }
    2966             : 
    2967             :   if (VT != MVT::i64)
    2968             :     return SDValue();
    2969          21 : 
    2970          21 :   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
    2971             : 
    2972             :   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    2973             :   // common case, splitting this into a move and a 32-bit shift is faster and
    2974             :   // the same code size.
    2975             :   if (RHSVal < 32)
    2976             :     return SDValue();
    2977        4353 : 
    2978             :   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
    2979        4353 : 
    2980             :   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
    2981        2728 :   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
    2982        2728 : 
    2983        2728 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2984             : 
    2985             :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
    2986             :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2987             : }
    2988       10541 : 
    //               : /// DAG combine for an i64 SRA with a constant shift amount of exactly 32
    //               : /// or 63; both are rewritten as 32-bit operations on the high half of the
    //               : /// source. Returns an empty SDValue for every other case.
    2989             : SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
    2990             :                                                 DAGCombinerInfo &DCI) const {
    2991             :   if (N->getValueType(0) != MVT::i64)
    2992             :     return SDValue();
    2993             : 
    2994             :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2995        2404 :   if (!RHS)
    2996        1259 :     return SDValue();
    2997             : 
    2998        1145 :   SelectionDAG &DAG = DCI.DAG;
    2999             :   SDLoc SL(N);
    3000        1145 :   unsigned RHSVal = RHS->getZExtValue();
    3001        1145 : 
    3002             :   // (sra i64:x, 32) -> build_pair hi_32(x), (sra hi_32(x), 31)
    3003        1145 :   if (RHSVal == 32) {
    3004             :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    3005        2290 :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    3006        1145 :                                    DAG.getConstant(31, SL, MVT::i32));
    3007             : 
    3008             :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    3009        6417 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    3010             :   }
    3011        6417 : 
    3012        5868 :   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
    3013             :   if (RHSVal == 63) {
    3014         549 :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    3015             :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    3016          34 :                                    DAG.getConstant(31, SL, MVT::i32));
    3017             :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    3018         515 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    3019             :   }
    3020         515 : 
    3021             :   return SDValue();
    3022             : }
    3023         515 : 
    //               : /// DAG combine for an i64 SRL with a constant shift amount of at least 32:
    //               : /// the result is rebuilt from a 32-bit shift of the high half and a zero
    //               : /// high word. Returns an empty SDValue for every other case.
    3024          12 : SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
    3025             :                                                 DAGCombinerInfo &DCI) const {
    3026           6 :   if (N->getValueType(0) != MVT::i64)
    3027             :     return SDValue();
    3028          12 : 
    3029           6 :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    3030             :   if (!RHS)
    3031             :     return SDValue();
    3032             : 
    3033         509 :   unsigned ShiftAmt = RHS->getZExtValue();
    3034         264 :   if (ShiftAmt < 32)
    3035             :     return SDValue();
    3036         132 : 
    3037         264 :   // srl i64:x, C for C >= 32
    3038         132 :   // =>
    3039             :   //   build_pair (srl hi_32(x), C - 32), 0
    3040             : 
    3041         377 :   SelectionDAG &DAG = DCI.DAG;
    3042             :   SDLoc SL(N);
    3043             : 
    3044       62750 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    3045             :   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    3046       62750 : 
    //               :   // The high half is read as element 1 of the source viewed as v2i32.
    3047       53219 :   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
    3048             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
    3049        9531 :                            VecOp, One);
    3050             : 
    3051         431 :   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
    3052             :   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
    3053        9100 : 
    3054        9100 :   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
    3055         173 : 
    3056             :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
    3057             : }
    3058             : 
    //               : /// DAG combine for TRUNCATE. Three rewrites are attempted in order:
    //               : ///  1. truncate of a bitcast build_vector -> truncate of its first element;
    //               : ///  2. scalar truncate of (srl (bitcast (build_vector x, y)), half-width)
    //               : ///     -> truncate of the second element;
    //               : ///  3. truncate to fewer than 32 bits of a wider-than-32-bit shift whose
    //               : ///     amount is small enough -> the same shift performed in 32 bits.
    //               : /// Returns an empty SDValue when none of them applies.
    3059             : SDValue AMDGPUTargetLowering::performTruncateCombine(
    3060             :   SDNode *N, DAGCombinerInfo &DCI) const {
    3061        8927 :   SDLoc SL(N);
    3062             :   SelectionDAG &DAG = DCI.DAG;
    3063             :   EVT VT = N->getValueType(0);
    3064        8927 :   SDValue Src = N->getOperand(0);
    3065        8927 : 
    3066             :   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
    3067       17854 :   if (Src.getOpcode() == ISD::BITCAST) {
    3068             :     SDValue Vec = Src.getOperand(0);
    3069        8927 :     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
    3070             :       SDValue Elt0 = Vec.getOperand(0);
    3071        8927 :       EVT EltVT = Elt0.getValueType();
    3072        8927 :       if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
    //               :         // A floating-point element is first reinterpreted as an integer of
    //               :         // the same width so that it can be truncated.
    3073             :         if (EltVT.isFloatingPoint()) {
    3074       17854 :           Elt0 = DAG.getNode(ISD::BITCAST, SL,
    3075             :                              EltVT.changeTypeToInteger(), Elt0);
    3076        8927 :         }
    3077             : 
    3078             :         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
    3079       55273 :       }
    3080             :     }
    3081             :   }
    3082       55273 : 
    3083       55273 :   // Equivalent of above for accessing the high element of a vector as an
    3084       55273 :   // integer operation.
    3085             :   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
    3086             :   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    3087       55273 :     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
    //               :       // The shift amount must be exactly half of the shifted value's width.
    3088        5935 :       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
    3089        5935 :         SDValue BV = stripBitcast(Src.getOperand(0));
    3090        1974 :         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
    3091        1974 :             BV.getValueType().getVectorNumElements() == 2) {
    3092        1974 :           SDValue SrcElt = BV.getOperand(1);
    3093        1845 :           EVT SrcEltVT = SrcElt.getValueType();
    3094          11 :           if (SrcEltVT.isFloatingPoint()) {
    3095          11 :             SrcElt = DAG.getNode(ISD::BITCAST, SL,
    3096             :                                  SrcEltVT.changeTypeToInteger(), SrcElt);
    3097             :           }
    3098        1845 : 
    3099             :           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
    3100             :         }
    3101             :       }
    3102             :     }
    3103             :   }
    3104             : 
    3105             :   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
    3106       80021 :   //
    3107       26588 :   // i16 (trunc (srl i64:x, K)), K <= 16 ->
    3108       79572 :   //     i16 (trunc (srl (i32 (trunc x), K)))
    //               :   // The code below also accepts SRA and SHL, vector types, and any result
    //               :   // whose scalar width is below 32 bits.
    3109       19770 :   if (VT.getScalarSizeInBits() < 32) {
    3110       21082 :     EVT SrcVT = Src.getValueType();
    3111       21082 :     if (SrcVT.getScalarSizeInBits() > 32 &&
    3112        1184 :         (Src.getOpcode() == ISD::SRL ||
    3113        1184 :          Src.getOpcode() == ISD::SRA ||
    3114        1184 :          Src.getOpcode() == ISD::SHL)) {
    3115           6 :       SDValue Amt = Src.getOperand(1);
    3116           6 :       KnownBits Known;
    3117             :       DAG.computeKnownBits(Amt, Known);
    3118             :       unsigned Size = VT.getScalarSizeInBits();
    //               :       // Accept either a known-constant amount no larger than the result
    //               :       // width, or an amount with at most Log2_32(Size) possibly-set low bits.
    3119        1184 :       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
    3120             :           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
    3121             :         EVT MidVT = VT.isVector() ?
    3122             :           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    3123             :                            VT.getVectorNumElements()) : MVT::i32;
    3124             : 
    3125             :         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
    3126             :         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
    3127             :                                     Src.getOperand(0));
    3128             :         DCI.AddToWorklist(Trunc.getNode());
    3129       52244 : 
    3130       16623 :         if (Amt.getValueType() != NewShiftVT) {
    3131       16623 :           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
    3132        1529 :           DCI.AddToWorklist(Amt.getNode());
    3133        1473 :         }
    3134             : 
    3135        2924 :         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
    3136        2289 :                                           Trunc, Amt);
    3137        2924 :         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
    3138             :       }
    3139        5214 :     }
    3140        2290 :   }
    3141             : 
    3142           1 :   return SDValue();
    3143         636 : }
    3144             : 
    3145         635 : // We need to specifically handle i64 mul here to avoid unnecessary conversion
    3146             : // instructions. If we only match on the legalized i64 mul expansion,
    3147         635 : // SimplifyDemandedBits will be unable to remove them because there will be
    3148         635 : // multiple uses due to the separate mul + mulh[su].
    //               : //
    //               : // Builds the 24-bit multiply of N0 and N1. For Size <= 32 this is a single
    //               : // i32 MUL_I24/MUL_U24 node (chosen by Signed); otherwise a two-result
    //               : // MUL_LOHI_I24/MUL_LOHI_U24 node whose results are joined into an i64.
    3149             : static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
    3150         635 :                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
    3151           1 :   if (Size <= 32) {
    3152           1 :     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3153             :     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
    3154             :   }
    3155             : 
    3156         635 :   // Because we want to eliminate extension instructions before the
    3157         635 :   // operation, we need to create a single user here (i.e. not the separate
    3158             :   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
    3159             : 
    3160             :   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
    3161             : 
    3162       51609 :   SDValue Mul = DAG.getNode(MulOpc, SL,
    3163             :                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
    3164             : 
    3165             :   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
    3166             :                      Mul.getValue(0), Mul.getValue(1));
    3167             : }
    3168             : 
    //               : /// DAG combine that turns a scalar multiply of at most 64 bits into a
    //               : /// 24-bit multiply when both operands are known to fit in 24 bits.
    //               : ///
    //               : /// The unsigned form is tried first (hasMulU24 + isU24), then the signed
    //               : /// form (hasMulI24 + isI24). Vectors, types wider than 64 bits, and types
    //               : /// of at most 16 bits on subtargets with 16-bit instructions are left
    //               : /// alone. Returns an empty SDValue when no rewrite is performed.
    3169        1416 : SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
    3170             :                                                 DAGCombinerInfo &DCI) const {
    3171        1416 :   EVT VT = N->getValueType(0);
    3172        1362 : 
    3173        1362 :   unsigned Size = VT.getSizeInBits();
    3174             :   if (VT.isVector() || Size > 64)
    3175             :     return SDValue();
    3176             : 
    3177             :   // There are i16 integer mul/mad.
    3178             :   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    3179             :     return SDValue();
    3180          54 : 
    3181             :   SelectionDAG &DAG = DCI.DAG;
    3182             :   SDLoc DL(N);
    3183          54 : 
    3184             :   SDValue N0 = N->getOperand(0);
    3185             :   SDValue N1 = N->getOperand(1);
    3186          54 : 
    3187             :   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
    3188             :   // in the source into any_extends if the result of the mul is truncated. Since
    3189        7810 :   // we can assume the high bits are whatever we want, use the underlying value
    3190             :   // to avoid the unknown high bits from interfering.
    3191        7810 :   if (N0.getOpcode() == ISD::ANY_EXTEND)
    3192             :     N0 = N0.getOperand(0);
    3193        7810 : 
    3194        7810 :   if (N1.getOpcode() == ISD::ANY_EXTEND)
    3195         338 :     N1 = N1.getOperand(0);
    3196             : 
    3197             :   SDValue Mul;
    3198        7472 : 
    //               :   // Operands are brought to i32 with the extension matching the chosen
    //               :   // multiply: zero-extension for U24, sign-extension for I24.
    3199         254 :   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    3200             :     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3201        7218 :     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3202             :     Mul = getMul24(DAG, DL, N0, N1, Size, false);
    3203             :   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    3204        7218 :     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3205        7218 :     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3206             :     Mul = getMul24(DAG, DL, N0, N1, Size, true);
    3207             :   } else {
    3208             :     return SDValue();
    3209             :   }
    3210             : 
    3211        7218 :   // We need to use sext even for MUL_U24, because MUL_U24 is used
    3212          83 :   // for signed multiply of 8 and 16-bit types.
    3213             :   return DAG.getSExtOrTrunc(Mul, DL, VT);
    3214        7218 : }
    3215          75 : 
    3216             : SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
    3217        7218 :                                                   DAGCombinerInfo &DCI) const {
    3218             :   EVT VT = N->getValueType(0);
    3219       15664 : 
    3220        1139 :   if (!Subtarget->hasMulI24() || VT.isVector())
    3221        1139 :     return SDValue();
    3222        1139 : 
    3223        6079 :   SelectionDAG &DAG = DCI.DAG;
    3224         277 :   SDLoc DL(N);
    3225         277 : 
    3226         277 :   SDValue N0 = N->getOperand(0);
    3227             :   SDValue N1 = N->getOperand(1);
    3228        5802 : 
    3229             :   if (!isI24(N0, DAG) || !isI24(N1, DAG))
    3230             :     return SDValue();
    3231             : 
    3232             :   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3233        1416 :   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3234             : 
    3235             :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
    3236         103 :   DCI.AddToWorklist(Mulhi.getNode());
    3237             :   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
    3238         103 : }
    3239             : 
    3240         176 : SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
    3241          30 :                                                   DAGCombinerInfo &DCI) const {
    3242             :   EVT VT = N->getValueType(0);
    3243          73 : 
    3244             :   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    3245             :     return SDValue();
    3246          73 : 
    3247          73 :   SelectionDAG &DAG = DCI.DAG;
    3248             :   SDLoc DL(N);
    3249          73 : 
    3250          73 :   SDValue N0 = N->getOperand(0);
    3251             :   SDValue N1 = N->getOperand(1);
    3252           0 : 
    3253           0 :   if (!isU24(N0, DAG) || !isU24(N1, DAG))
    3254             :     return SDValue();
    3255           0 : 
    3256           0 :   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3257           0 :   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3258             : 
    3259             :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
    3260        3893 :   DCI.AddToWorklist(Mulhi.getNode());
    3261             :   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
    3262        3893 : }
    3263             : 
    3264        7786 : SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
    3265           0 :   SDNode *N, DAGCombinerInfo &DCI) const {
    3266             :   SelectionDAG &DAG = DCI.DAG;
    3267        3893 : 
    3268             :   // Simplify demanded bits before splitting into multiple users.
    3269             :   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
    3270        3893 :     return SDValue();
    3271        3893 : 
    3272             :   SDValue N0 = N->getOperand(0);
    3273        3893 :   SDValue N1 = N->getOperand(1);
    3274        3893 : 
    3275             :   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
    3276           0 : 
    3277           0 :   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3278             :   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
    3279           0 : 
    3280           0 :   SDLoc SL(N);
    3281           0 : 
    3282             :   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
    3283             :   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
    3284         142 :   return DAG.getMergeValues({ MulLo, MulHi }, SL);
    3285             : }
    3286         142 : 
    3287             : static bool isNegativeOne(SDValue Val) {
    3288             :   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
    3289         142 :     return C->isAllOnesValue();
    3290          88 :   return false;
    3291             : }
    3292          54 : 
    3293          54 : SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
    3294             :                                           SDValue Op,
    3295          54 :                                           const SDLoc &DL,
    3296             :                                           unsigned Opc) const {
    3297          54 :   EVT VT = Op.getValueType();
    3298          54 :   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
    3299             :   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
    3300             :                               LegalVT != MVT::i16))
    3301             :     return SDValue();
    3302          54 : 
    3303          54 :   if (VT != MVT::i32)
    3304         108 :     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
    3305             : 
    3306             :   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
    3307           0 :   if (VT != MVT::i32)
    3308             :     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
    3309           0 : 
    3310             :   return FFBX;
    3311             : }
    3312             : 
    3313          21 : // The native instructions return -1 on 0 input. Optimize out a select that
    3314             : // produces -1 on 0.
    3315             : //
    3316             : // TODO: If zero is not undef, we could also do this if the output is compared
    3317          21 : // against the bitwidth.
    3318          21 : //
    3319           3 : // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
    3320             : SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
    3321           0 :                                                  SDValue LHS, SDValue RHS,
    3322             :                                                  DAGCombinerInfo &DCI) const {
    3323             :   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    3324          11 :   if (!CmpRhs || !CmpRhs->isNullValue())
    3325             :     return SDValue();
    3326          21 : 
    3327             :   SelectionDAG &DAG = DCI.DAG;
    3328          11 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    3329             :   SDValue CmpLHS = Cond.getOperand(0);
    3330          21 : 
    3331             :   unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
    3332             :                                            AMDGPUISD::FFBH_U32;
    3333             : 
    3334             :   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
    3335             :   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
    3336             :   if (CCOpcode == ISD::SETEQ &&
    3337             :       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
    3338             :       RHS.getOperand(0) == CmpLHS &&
    3339             :       isNegativeOne(LHS)) {
    3340        9190 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3341             :   }
    3342             : 
    3343             :   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
    3344       12226 :   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
    3345        4678 :   if (CCOpcode == ISD::SETNE &&
    3346             :       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
    3347        4512 :       LHS.getOperand(0) == CmpLHS &&
    3348        4512 :       isNegativeOne(RHS)) {
    3349        4512 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3350             :   }
    3351        4512 : 
    3352             :   return SDValue();
    3353             : }
    3354             : 
    3355             : static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
    3356        2264 :                                          unsigned Op,
    3357        2262 :                                          const SDLoc &SL,
    3358        4514 :                                          SDValue Cond,
    3359           2 :                                          SDValue N1,
    3360           2 :                                          SDValue N2) {
    3361             :   SelectionDAG &DAG = DCI.DAG;
    3362             :   EVT VT = N1.getValueType();
    3363             : 
    3364             :   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
    3365         881 :                                   N1.getOperand(0), N2.getOperand(0));
    3366         858 :   DCI.AddToWorklist(NewSelect.getNode());
    3367        4533 :   return DAG.getNode(Op, SL, VT, NewSelect);
    3368          23 : }
    3369          19 : 
    3370             : // Pull a free FP operation out of a select so it may fold into uses.
    3371             : //
    3372        4491 : // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
    3373             : // select c, (fneg x), k -> fneg (select c, x, (fneg k))
    3374             : //
    3375           0 : // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
    3376             : // select c, (fabs x), +k -> fabs (select c, x, k)
    3377             : static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
    3378             :                                     SDValue N) {
    3379             :   SelectionDAG &DAG = DCI.DAG;
    3380             :   SDValue Cond = N.getOperand(0);
    3381           0 :   SDValue LHS = N.getOperand(1);
    3382           0 :   SDValue RHS = N.getOperand(2);
    3383             : 
    3384             :   EVT VT = N.getValueType();
    3385           0 :   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
    3386           0 :       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    3387           0 :     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
    3388             :                                      SDLoc(N), Cond, LHS, RHS);
    3389             :   }
    3390             : 
    3391             :   bool Inv = false;
    3392             :   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    3393             :     std::swap(LHS, RHS);
    3394             :     Inv = true;
    3395             :   }
    3396             : 
    3397           0 :   // TODO: Support vector constants.
    3398             :   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    3399           0 :   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
    3400           0 :     SDLoc SL(N);
    3401           0 :     // If one side is an fneg/fabs and the other is a constant, we can push the
    3402           0 :     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    3403             :     SDValue NewLHS = LHS.getOperand(0);
    3404           0 :     SDValue NewRHS = RHS;
    3405           0 : 
    3406           0 :     // Careful: if the neg can be folded up, don't try to pull it back down.
    3407             :     bool ShouldFoldNeg = true;
    3408           0 : 
    3409             :     if (NewLHS.hasOneUse()) {
    3410             :       unsigned Opc = NewLHS.getOpcode();
    3411             :       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
    3412           0 :         ShouldFoldNeg = false;
    3413             :       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
    3414             :         ShouldFoldNeg = false;
    3415             :     }
    3416             : 
    3417             :     if (ShouldFoldNeg) {
    3418             :       if (LHS.getOpcode() == ISD::FNEG)
    3419           0 :         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3420           0 :       else if (CRHS->isNegative())
    3421             :         return SDValue();
    3422             : 
    3423           0 :       if (Inv)
    3424           0 :         std::swap(NewLHS, NewRHS);
    3425             : 
    3426             :       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
    3427             :                                       Cond, NewLHS, NewRHS);
    3428             :       DCI.AddToWorklist(NewSelect.getNode());
    3429           0 :       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    3430             :     }
    3431           0 :   }
    3432             : 
    3433           0 :   return SDValue();
    3434             : }
    3435             : 
    3436             : 
    3437           0 : SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
    3438           0 :                                                    DAGCombinerInfo &DCI) const {
    3439           0 :   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    3440           0 :     return Folded;
    3441           0 : 
    3442             :   SDValue Cond = N->getOperand(0);
    3443           0 :   if (Cond.getOpcode() != ISD::SETCC)
    3444             :     return SDValue();
    3445             : 
    3446             :   EVT VT = N->getValueType(0);
    3447           0 :   SDValue LHS = Cond.getOperand(0);
    3448           0 :   SDValue RHS = Cond.getOperand(1);
    3449           0 :   SDValue CC = Cond.getOperand(2);
    3450             : 
    3451             :   SDValue True = N->getOperand(1);
    3452             :   SDValue False = N->getOperand(2);
    3453           0 : 
    3454             :   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    3455             :     SelectionDAG &DAG = DCI.DAG;
    3456             :     if ((DAG.isConstantValueOfAnyType(True) ||
    3457       11431 :          DAG.isConstantValueOfAnyType(True)) &&
    3458             :         (!DAG.isConstantValueOfAnyType(False) &&
    3459       11431 :          !DAG.isConstantValueOfAnyType(False))) {
    3460          66 :       // Swap cmp + select pair to move constant to false input.
    3461             :       // This will allow using VOPC cndmasks more often.
    3462       11365 :       // select (setcc x, y), k, x -> select (setcc y, x) x, x
    3463       11365 : 
    3464        1517 :       SDLoc SL(N);
    3465             :       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
    3466       19696 :                                             LHS.getValueType().isInteger());
    3467        9848 : 
    3468        9848 :       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
    3469        9848 :       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    3470             :     }
    3471        9848 : 
    3472        9848 :     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
    3473             :       SDValue MinMax
    3474        9848 :         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
    3475        3920 :       // Revisit this node so we can catch min3/max3/med3 patterns.
    3476        6570 :       //DCI.AddToWorklist(MinMax.getNode());
    3477        5190 :       return MinMax;
    3478        1530 :     }
    3479         260 :   }
    3480             : 
    3481             :   // There's no reason to not do this if the condition has other uses.
    3482             :   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
    3483             : }
    3484             : 
    3485         260 : static bool isInv2Pi(const APFloat &APF) {
    3486         260 :   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
    3487             :   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
    3488         260 :   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
    3489         260 : 
    3490             :   return APF.bitwiseIsEqual(KF16) ||
    3491             :          APF.bitwiseIsEqual(KF32) ||
    3492         715 :          APF.bitwiseIsEqual(KF64);
    3493             : }
    3494         398 : 
    3495             : // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
    3496             : // additional cost to negate them.
    3497         398 : bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
    3498             :   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
    3499             :     if (C->isZero() && !C->isNegative())
    3500             :       return true;
    3501             : 
    3502       18380 :     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    3503             :       return true;
    3504             :   }
    3505          36 : 
    3506          40 :   return false;
    3507          40 : }
    3508          40 : 
    3509             : static unsigned inverseMinMax(unsigned Opc) {
    3510          68 :   switch (Opc) {
    3511          60 :   case ISD::FMAXNUM:
    3512          24 :     return ISD::FMINNUM;
    3513             :   case ISD::FMINNUM:
    3514             :     return ISD::FMAXNUM;
    3515             :   case AMDGPUISD::FMAX_LEGACY:
    3516             :     return AMDGPUISD::FMIN_LEGACY;
    3517         143 :   case AMDGPUISD::FMIN_LEGACY:
    3518         143 :     return  AMDGPUISD::FMAX_LEGACY;
    3519         228 :   default:
    3520             :     llvm_unreachable("invalid min/max opcode");
    3521             :   }
    3522          81 : }
    3523          16 : 
    3524             : SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
    3525             :                                                  DAGCombinerInfo &DCI) const {
    3526             :   SelectionDAG &DAG = DCI.DAG;
    3527             :   SDValue N0 = N->getOperand(0);
    3528             :   EVT VT = N->getValueType(0);
    3529             : 
    3530          94 :   unsigned Opc = N0.getOpcode();
    3531             : 
    3532             :   // If the input has multiple uses and we can either fold the negate down, or
    3533          57 :   // the other uses cannot, give up. This both prevents unprofitable
    3534             :   // transformations and infinite loops: we won't repeatedly try to fold around
    3535           2 :   // a negate that has no 'good' form.
    3536             :   if (N0.hasOneUse()) {
    3537           3 :     // This may be able to fold into the source, but at a code size cost. Don't
    3538             :     // fold if the fold into the user is free.
    3539           0 :     if (allUsesHaveSourceMods(N, 0))
    3540           0 :       return SDValue();
    3541             :   } else {
    3542             :     if (fnegFoldsIntoOp(Opc) &&
    3543             :         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
    3544        3991 :       return SDValue();
    3545             :   }
    3546        3991 : 
    3547        3991 :   SDLoc SL(N);
    3548        7982 :   switch (Opc) {
    3549             :   case ISD::FADD: {
    3550             :     if (!mayIgnoreSignedZero(N0))
    3551             :       return SDValue();
    3552             : 
    3553             :     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    3554             :     SDValue LHS = N0.getOperand(0);
    3555             :     SDValue RHS = N0.getOperand(1);
    3556        3991 : 
    3557             :     if (LHS.getOpcode() != ISD::FNEG)
    3558             :       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3559        3011 :     else
    3560        1070 :       LHS = LHS.getOperand(0);
    3561             : 
    3562        1333 :     if (RHS.getOpcode() != ISD::FNEG)
    3563         438 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3564         312 :     else
    3565             :       RHS = RHS.getOperand(0);
    3566             : 
    3567             :     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    3568        2609 :     if (!N0.hasOneUse())
    3569         118 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3570          92 :     return Res;
    3571          94 :   }
    3572             :   case ISD::FMUL:
    3573             :   case AMDGPUISD::FMUL_LEGACY: {
    3574          24 :     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    3575          24 :     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    3576             :     SDValue LHS = N0.getOperand(0);
    3577          24 :     SDValue RHS = N0.getOperand(1);
    3578          16 : 
    3579             :     if (LHS.getOpcode() == ISD::FNEG)
    3580           8 :       LHS = LHS.getOperand(0);
    3581             :     else if (RHS.getOpcode() == ISD::FNEG)
    3582          24 :       RHS = RHS.getOperand(0);
    3583          20 :     else
    3584             :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3585           4 : 
    3586             :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    3587          24 :     if (!N0.hasOneUse())
    3588          24 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3589           4 :     return Res;
    3590          24 :   }
    3591             :   case ISD::FMA:
    3592          86 :   case ISD::FMAD: {
    3593             :     if (!mayIgnoreSignedZero(N0))
    3594             :       return SDValue();
    3595             : 
    3596          86 :     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    3597          86 :     SDValue LHS = N0.getOperand(0);
    3598             :     SDValue MHS = N0.getOperand(1);
    3599          86 :     SDValue RHS = N0.getOperand(2);
    3600          16 : 
    3601          70 :     if (LHS.getOpcode() == ISD::FNEG)
    3602           4 :       LHS = LHS.getOperand(0);
    3603             :     else if (MHS.getOpcode() == ISD::FNEG)
    3604          66 :       MHS = MHS.getOperand(0);
    3605             :     else
    3606          86 :       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
    3607          86 : 
    3608           9 :     if (RHS.getOpcode() != ISD::FNEG)
    3609          86 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3610             :     else
    3611          79 :       RHS = RHS.getOperand(0);
    3612             : 
    3613          53 :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    3614          57 :     if (!N0.hasOneUse())
    3615             :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3616             :     return Res;
    3617          22 :   }
    3618          22 :   case ISD::FMAXNUM:
    3619          22 :   case ISD::FMINNUM:
    3620             :   case AMDGPUISD::FMAX_LEGACY:
    3621          22 :   case AMDGPUISD::FMIN_LEGACY: {
    3622          10 :     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    3623          12 :     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    3624           2 :     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    3625             :     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
    3626          10 : 
    3627             :     SDValue LHS = N0.getOperand(0);
    3628          22 :     SDValue RHS = N0.getOperand(1);
    3629          18 : 
    3630             :     // 0 doesn't have a negated inline immediate.
    3631           4 :     // TODO: This constant check should be generalized to other operations.
    3632             :     if (isConstantCostlierToNegate(RHS))
    3633          22 :       return SDValue();
    3634          22 : 
    3635           2 :     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3636          22 :     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3637             :     unsigned Opposite = inverseMinMax(Opc);
    3638         143 : 
    3639             :     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    3640             :     if (!N0.hasOneUse())
    3641             :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3642             :     return Res;
    3643             :   }
    3644             :   case AMDGPUISD::FMED3: {
    3645             :     SDValue Ops[3];
    3646             :     for (unsigned I = 0; I < 3; ++I)
    3647         143 :       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
    3648         143 : 
    3649             :     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    3650             :     if (!N0.hasOneUse())
    3651             :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3652         143 :     return Res;
    3653          49 :   }
    3654             :   case ISD::FP_EXTEND:
    3655          94 :   case ISD::FTRUNC:
    3656          94 :   case ISD::FRINT:
    3657             :   case ISD::FNEARBYINT: // XXX - Should fround be handled?
    3658             :   case ISD::FSIN:
    3659          94 :   case ISD::FCANONICALIZE:
    3660          94 :   case AMDGPUISD::RCP:
    3661           8 :   case AMDGPUISD::RCP_LEGACY:
    3662          94 :   case AMDGPUISD::RCP_IFLAG:
    3663             :   case AMDGPUISD::SIN_HW: {
    3664          12 :     SDValue CvtSrc = N0.getOperand(0);
    3665          12 :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3666          48 :       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
    3667          72 :       // (fneg (rcp (fneg x))) -> (rcp x)
    3668             :       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    3669          24 :     }
    3670          12 : 
    3671           2 :     if (!N0.hasOneUse())
    3672          12 :       return SDValue();
    3673             : 
    3674          96 :     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    3675             :     // (fneg (rcp x)) -> (rcp (fneg x))
    3676             :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3677             :     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
    3678             :   }
    3679             :   case ISD::FP_ROUND: {
    3680             :     SDValue CvtSrc = N0.getOperand(0);
    3681             : 
    3682             :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3683             :       // (fneg (fp_round (fneg x))) -> (fp_round x)
    3684          96 :       return DAG.getNode(ISD::FP_ROUND, SL, VT,
    3685          96 :                          CvtSrc.getOperand(0), N0.getOperand(1));
    3686             :     }
    3687             : 
    3688          14 :     if (!N0.hasOneUse())
    3689             :       return SDValue();
    3690             : 
    3691          82 :     // (fneg (fp_round x)) -> (fp_round (fneg x))
    3692          30 :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3693             :     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
    3694             :   }
    3695             :   case ISD::FP16_TO_FP: {
    3696          52 :     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    3697          52 :     // f16, but legalization of f16 fneg ends up pulling it out of the source.
    3698             :     // Put the fneg back as a legal source operation that can be matched later.
    3699          16 :     SDLoc SL(N);
    3700          16 : 
    3701             :     SDValue Src = N0.getOperand(0);
    3702          16 :     EVT SrcVT = Src.getValueType();
    3703             : 
    3704             :     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    3705           0 :     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
    3706             :                                   DAG.getConstant(0x8000, SL, SrcVT));
    3707             :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
    3708          16 :   }
    3709           8 :   default:
    3710             :     return SDValue();
    3711             :   }
    3712           8 : }
    3713           8 : 
    3714             : SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
    3715             :                                                  DAGCombinerInfo &DCI) const {
    3716             :   SelectionDAG &DAG = DCI.DAG;
    3717             :   SDValue N0 = N->getOperand(0);
    3718             : 
    3719             :   if (!N0.hasOneUse())
    3720             :     return SDValue();
    3721          28 : 
    3722          28 :   switch (N0.getOpcode()) {
    3723             :   case ISD::FP16_TO_FP: {
    3724             :     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    3725             :     SDLoc SL(N);
    3726          28 :     SDValue Src = N0.getOperand(0);
    3727          56 :     EVT SrcVT = Src.getValueType();
    3728             : 
    3729        2031 :     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    3730        2031 :     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
    3731             :                                   DAG.getConstant(0x7fff, SL, SrcVT));
    3732             :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
    3733             :   }
    3734        1934 :   default:
    3735             :     return SDValue();
    3736        1934 :   }
    3737        1934 : }
    3738             : 
    3739        1934 : SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
    3740         293 :                                                 DAGCombinerInfo &DCI) const {
    3741             :   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
    3742        1641 :   if (!CFP)
    3743             :     return SDValue();
    3744             : 
    3745             :   // XXX - Should this flush denormals?
    3746          18 :   const APFloat &Val = CFP->getValueAPF();
    3747          18 :   APFloat One(Val.getSemantics(), "1.0");
    3748             :   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
    3749             : }
    3750             : 
    3751          18 : SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    3752          36 :                                                 DAGCombinerInfo &DCI) const {
    3753             :   SelectionDAG &DAG = DCI.DAG;
    3754        1623 :   SDLoc DL(N);
    3755        1623 : 
    3756             :   switch(N->getOpcode()) {
    3757             :   default:
    3758             :     break;
    3759        1069 :   case ISD::BITCAST: {
    3760             :     EVT DestVT = N->getValueType(0);
    3761        1069 : 
    3762             :     // Push casts through vector builds. This helps avoid emitting a large
    3763        1067 :     // number of copies when materializing floating point vector constants.
    3764             :     //
    3765             :     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    3766           2 :     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    3767           2 :     if (DestVT.isVector()) {
    3768           6 :       SDValue Src = N->getOperand(0);
    3769             :       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    3770             :         EVT SrcVT = Src.getValueType();
    3771      893662 :         unsigned NElts = DestVT.getVectorNumElements();
    3772             : 
    3773      893662 :         if (SrcVT.getVectorNumElements() == NElts) {
    3774             :           EVT DestEltVT = DestVT.getVectorElementType();
    3775             : 
    3776     1787324 :           SmallVector<SDValue, 8> CastedElts;
    3777             :           SDLoc SL(N);
    3778             :           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
    3779      114468 :             SDValue Elt = Src.getOperand(I);
    3780      228936 :             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
    3781             :           }
    3782             : 
    3783             :           return DAG.getBuildVector(DestVT, SL, CastedElts);
    3784             :         }
    3785             :       }
    3786             :     }
    3787      114468 : 
    3788       42942 :     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
    3789       42942 :       break;
    3790        6551 : 
    3791             :     // Fold bitcasts of constants.
    3792             :     //
    3793        6551 :     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    3794        2744 :     // TODO: Generalize and move to DAGCombiner
    3795             :     SDValue Src = N->getOperand(0);
    3796             :     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
    3797             :       if (Src.getValueType() == MVT::i64) {
    3798       15683 :         SDLoc SL(N);
    3799       12939 :         uint64_t CVal = C->getZExtValue();
    3800       12939 :         return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
    3801             :                            DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3802             :                            DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3803        2744 :       }
    3804             :     }
    3805             : 
    3806             :     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
    3807             :       const APInt &Val = C->getValueAPF().bitcastToAPInt();
    3808      170251 :       SDLoc SL(N);
    3809             :       uint64_t CVal = Val.getZExtValue();
    3810             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    3811             :                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3812             :                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3813             : 
    3814             :       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    3815       75387 :     }
    3816             : 
    3817             :     break;
    3818             :   }
    3819         368 :   case ISD::SHL: {
    3820             :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3821             :       break;
    3822         368 : 
    3823             :     return performShlCombine(N, DCI);
    3824             :   }
    3825             :   case ISD::SRL: {
    3826             :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3827          42 :       break;
    3828             : 
    3829             :     return performSrlCombine(N, DCI);
    3830             :   }
    3831             :   case ISD::SRA: {
    3832          21 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3833             :       break;
    3834          21 : 
    3835             :     return performSraCombine(N, DCI);
    3836             :   }
    3837             :   case ISD::TRUNCATE:
    3838             :     return performTruncateCombine(N, DCI);
    3839       31970 :   case ISD::MUL:
    3840       31970 :     return performMulCombine(N, DCI);
    3841             :   case ISD::MULHS:
    3842             :     return performMulhsCombine(N, DCI);
    3843       19523 :   case ISD::MULHU:
    3844             :     return performMulhuCombine(N, DCI);
    3845       96823 :   case AMDGPUISD::MUL_I24:
    3846       96823 :   case AMDGPUISD::MUL_U24:
    3847             :   case AMDGPUISD::MULHI_I24:
    3848             :   case AMDGPUISD::MULHI_U24: {
    3849       62750 :     // If the first call to simplify is successfull, then N may end up being
    3850             :     // deleted, so we shouldn't call simplifyI24 again.
    3851       12735 :     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
    3852       12735 :     return SDValue();
    3853             :   }
    3854             :   case AMDGPUISD::MUL_LOHI_I24:
    3855        6417 :   case AMDGPUISD::MUL_LOHI_U24:
    3856             :     return performMulLoHi24Combine(N, DCI);
    3857       55273 :   case ISD::SELECT:
    3858       55273 :     return performSelectCombine(N, DCI);
    3859        7810 :   case ISD::FNEG:
    3860        7810 :     return performFNegCombine(N, DCI);
    3861         103 :   case ISD::FABS:
    3862         103 :     return performFAbsCombine(N, DCI);
    3863        3893 :   case AMDGPUISD::BFE_I32:
    3864        3893 :   case AMDGPUISD::BFE_U32: {
    3865        6284 :     assert(!N->getValueType(0).isVector() &&
    3866             :            "Vector handling of BFE not implemented");
    3867             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    3868             :     if (!Width)
    3869             :       break;
    3870             : 
    3871        6284 :     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    3872        6284 :     if (WidthVal == 0)
    3873             :       return DAG.getConstant(0, DL, MVT::i32);
    3874         142 : 
    3875             :     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    3876         142 :     if (!Offset)
    3877       11431 :       break;
    3878       11431 : 
    3879        3991 :     SDValue BitsFrom = N->getOperand(0);
    3880        3991 :     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
    3881        1934 : 
    3882        1934 :     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
    3883         361 : 
    3884             :     if (OffsetVal == 0) {
    3885             :       // This is already sign / zero extended, so try to fold away extra BFEs.
    3886             :       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
    3887         361 : 
    3888             :       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
    3889             :       if (OpSignBits >= SignBits)
    3890             :         return BitsFrom;
    3891         349 : 
    3892         349 :       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
    3893          22 :       if (Signed) {
    3894             :         // This is a sign_extend_inreg. Replace it to take advantage of existing
    3895             :         // DAG Combines. If not eliminated, we will match back to BFE during
    3896             :         // selection.
    3897             : 
    3898             :         // TODO: The sext_inreg of extended types ends, although we can could
    3899         323 :         // handle them in a single BFE.
    3900         323 :         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
    3901             :                            DAG.getValueType(SmallVT));
    3902             :       }
    3903             : 
    3904         323 :       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    3905             :     }
    3906          58 : 
    3907             :     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
    3908          58 :       if (Signed) {
    3909          58 :         return constantFoldBFE<int32_t>(DAG,
    3910          34 :                                         CVal->getSExtValue(),
    3911             :                                         OffsetVal,
    3912          24 :                                         WidthVal,
    3913          24 :                                         DL);
    3914             :       }
    3915             : 
    3916             :       return constantFoldBFE<uint32_t>(DAG,
    3917             :                                        CVal->getZExtValue(),
    3918             :                                        OffsetVal,
    3919             :                                        WidthVal,
    3920             :                                        DL);
    3921          18 :     }
    3922             : 
    3923             :     if ((OffsetVal + WidthVal) >= 32 &&
    3924           6 :         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
    3925             :       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
    3926             :       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
    3927             :                          BitsFrom, ShiftVal);
    3928          48 :     }
    3929             : 
    3930          24 :     if (BitsFrom.hasOneUse()) {
    3931             :       APInt Demanded = APInt::getBitsSet(32,
    3932             :                                          OffsetVal,
    3933          24 :                                          OffsetVal + WidthVal);
    3934             : 
    3935             :       KnownBits Known;
    3936             :       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    3937          24 :                                             !DCI.isBeforeLegalizeOps());
    3938             :       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    3939             :       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
    3940          24 :           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
    3941             :         DCI.CommitTargetLoweringOpt(TLO);
    3942             :       }
    3943         217 :     }
    3944          67 : 
    3945          65 :     break;
    3946             :   }
    3947         114 :   case ISD::LOAD:
    3948             :     return performLoadCombine(N, DCI);
    3949             :   case ISD::STORE:
    3950         152 :     return performStoreCombine(N, DCI);
    3951             :   case AMDGPUISD::RCP:
    3952             :   case AMDGPUISD::RCP_IFLAG:
    3953             :     return performRcpCombine(N, DCI);
    3954             :   case ISD::AssertZext:
    3955          36 :   case ISD::AssertSext:
    3956          36 :     return performAssertSZExtCombine(N, DCI);
    3957          72 :   }
    3958          36 :   return SDValue();
    3959          64 : }
    3960          28 : 
    3961          14 : //===----------------------------------------------------------------------===//
    3962             : // Helper functions
    3963             : //===----------------------------------------------------------------------===//
    3964             : 
    3965             : SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
    3966             :                                                    const TargetRegisterClass *RC,
    3967      234052 :                                                    unsigned Reg, EVT VT,
    3968      234052 :                                                    const SDLoc &SL,
    3969      134066 :                                                    bool RawReg) const {
    3970      134066 :   MachineFunction &MF = DAG.getMachineFunction();
    3971         512 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3972             :   unsigned VReg;
    3973         512 : 
    3974       11426 :   if (!MRI.isLiveIn(Reg)) {
    3975             :     VReg = MRI.createVirtualRegister(RC);
    3976       11426 :     MRI.addLiveIn(Reg, VReg);
    3977             :   } else {
    3978      330729 :     VReg = MRI.getLiveInVirtReg(Reg);
    3979             :   }
    3980             : 
    3981             :   if (RawReg)
    3982             :     return DAG.getRegister(VReg, VT);
    3983             : 
    3984             :   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
    3985       19354 : }
    3986             : 
    3987             : SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
    3988             :                                                   EVT VT,
    3989             :                                                   const SDLoc &SL,
    3990       19354 :                                                   int64_t Offset) const {
    3991       19354 :   MachineFunction &MF = DAG.getMachineFunction();
    3992             :   MachineFrameInfo &MFI = MF.getFrameInfo();
    3993             : 
    3994       19354 :   int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
    3995        2013 :   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
    3996             :   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
    3997             : 
    3998       17341 :   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
    3999             :                      MachineMemOperand::MODereferenceable |
    4000             :                      MachineMemOperand::MOInvariant);
    4001       19354 : }
    4002         258 : 
    4003             : SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
    4004       19096 :                                                    const SDLoc &SL,
    4005             :                                                    SDValue Chain,
    4006             :                                                    SDValue ArgVal,
    4007           8 :                                                    int64_t Offset) const {
    4008             :   MachineFunction &MF = DAG.getMachineFunction();
    4009             :   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
    4010             : 
    4011           8 :   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
    4012           8 :   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
    4013             :                                MachineMemOperand::MODereferenceable);
    4014           8 :   return Store;
    4015           8 : }
    4016           8 : 
    4017             : SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
    4018             :                                              const TargetRegisterClass *RC,
    4019             :                                              EVT VT, const SDLoc &SL,
    4020           8 :                                              const ArgDescriptor &Arg) const {
    4021             :   assert(Arg && "Attempting to load missing argument");
    4022             : 
    4023          10 :   if (Arg.isRegister())
    4024             :     return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
    4025             :   return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
    4026             : }
    4027             : 
    4028          10 : uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    4029          10 :     const MachineFunction &MF, const ImplicitParameter Param) const {
    4030             :   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
    4031          10 :   const AMDGPUSubtarget &ST =
    4032             :       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
    4033          10 :   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
    4034          10 :   unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
    4035             :   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
    4036             :                        ExplicitArgOffset;
    4037        3590 :   switch (Param) {
    4038             :   case GRID_DIM:
    4039             :     return ArgOffset;
    4040             :   case GRID_OFFSET:
    4041             :     return ArgOffset + 4;
    4042             :   }
    4043        3590 :   llvm_unreachable("unexpected implicit parameter type");
    4044        3582 : }
    4045           8 : 
    4046             : #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
    4047             : 
    4048          44 : const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    4049             :   switch ((AMDGPUISD::NodeType)Opcode) {
    4050             :   case AMDGPUISD::FIRST_NUMBER: break;
    4051             :   // AMDIL DAG nodes
    4052          44 :   NODE_NAME_CASE(UMUL);
    4053          44 :   NODE_NAME_CASE(BRANCH_COND);
    4054             : 
    4055          44 :   // AMDGPU DAG nodes
    4056          44 :   NODE_NAME_CASE(IF)
    4057          44 :   NODE_NAME_CASE(ELSE)
    4058          44 :   NODE_NAME_CASE(LOOP)
    4059          44 :   NODE_NAME_CASE(CALL)
    4060           0 :   NODE_NAME_CASE(TC_RETURN)
    4061           0 :   NODE_NAME_CASE(TRAP)
    4062             :   NODE_NAME_CASE(RET_FLAG)
    4063           0 :   NODE_NAME_CASE(RETURN_TO_EPILOG)
    4064             :   NODE_NAME_CASE(ENDPGM)
    4065             :   NODE_NAME_CASE(DWORDADDR)
    4066             :   NODE_NAME_CASE(FRACT)
    4067             :   NODE_NAME_CASE(SETCC)
    4068           0 :   NODE_NAME_CASE(SETREG)
    4069           0 :   NODE_NAME_CASE(FMA_W_CHAIN)
    4070             :   NODE_NAME_CASE(FMUL_W_CHAIN)
    4071             :   NODE_NAME_CASE(CLAMP)
    4072             :   NODE_NAME_CASE(COS_HW)
    4073           0 :   NODE_NAME_CASE(SIN_HW)
    4074             :   NODE_NAME_CASE(FMAX_LEGACY)
    4075             :   NODE_NAME_CASE(FMIN_LEGACY)
    4076           0 :   NODE_NAME_CASE(FMAX3)
    4077           0 :   NODE_NAME_CASE(SMAX3)
    4078           0 :   NODE_NAME_CASE(UMAX3)
    4079           0 :   NODE_NAME_CASE(FMIN3)
    4080           0 :   NODE_NAME_CASE(SMIN3)
    4081           0 :   NODE_NAME_CASE(UMIN3)
    4082           0 :   NODE_NAME_CASE(FMED3)
    4083           0 :   NODE_NAME_CASE(SMED3)
    4084           0 :   NODE_NAME_CASE(UMED3)
    4085           0 :   NODE_NAME_CASE(FDOT2)
    4086           0 :   NODE_NAME_CASE(URECIP)
    4087           0 :   NODE_NAME_CASE(DIV_SCALE)
    4088           0 :   NODE_NAME_CASE(DIV_FMAS)
    4089           0 :   NODE_NAME_CASE(DIV_FIXUP)
    4090           0 :   NODE_NAME_CASE(FMAD_FTZ)
    4091           0 :   NODE_NAME_CASE(TRIG_PREOP)
    4092           0 :   NODE_NAME_CASE(RCP)
    4093           0 :   NODE_NAME_CASE(RSQ)
    4094           0 :   NODE_NAME_CASE(RCP_LEGACY)
    4095           0 :   NODE_NAME_CASE(RSQ_LEGACY)
    4096           0 :   NODE_NAME_CASE(RCP_IFLAG)
    4097           0 :   NODE_NAME_CASE(FMUL_LEGACY)
    4098           0 :   NODE_NAME_CASE(RSQ_CLAMP)
    4099           0 :   NODE_NAME_CASE(LDEXP)
    4100           0 :   NODE_NAME_CASE(FP_CLASS)
    4101           0 :   NODE_NAME_CASE(DOT4)
    4102           0 :   NODE_NAME_CASE(CARRY)
    4103           0 :   NODE_NAME_CASE(BORROW)
    4104           0 :   NODE_NAME_CASE(BFE_U32)
    4105           0 :   NODE_NAME_CASE(BFE_I32)
    4106           0 :   NODE_NAME_CASE(BFI)
    4107           0 :   NODE_NAME_CASE(BFM)
    4108           0 :   NODE_NAME_CASE(FFBH_U32)
    4109           0 :   NODE_NAME_CASE(FFBH_I32)
    4110           0 :   NODE_NAME_CASE(FFBL_B32)
    4111           0 :   NODE_NAME_CASE(MUL_U24)
    4112           0 :   NODE_NAME_CASE(MUL_I24)
    4113           0 :   NODE_NAME_CASE(MULHI_U24)
    4114           0 :   NODE_NAME_CASE(MULHI_I24)
    4115           0 :   NODE_NAME_CASE(MUL_LOHI_U24)
    4116           0 :   NODE_NAME_CASE(MUL_LOHI_I24)
    4117           0 :   NODE_NAME_CASE(MAD_U24)
    4118           0 :   NODE_NAME_CASE(MAD_I24)
    4119           0 :   NODE_NAME_CASE(MAD_I64_I32)
    4120           0 :   NODE_NAME_CASE(MAD_U64_U32)
    4121           0 :   NODE_NAME_CASE(PERM)
    4122           0 :   NODE_NAME_CASE(TEXTURE_FETCH)
    4123           0 :   NODE_NAME_CASE(EXPORT)
    4124           0 :   NODE_NAME_CASE(EXPORT_DONE)
    4125           0 :   NODE_NAME_CASE(R600_EXPORT)
    4126           0 :   NODE_NAME_CASE(CONST_ADDRESS)
    4127           0 :   NODE_NAME_CASE(REGISTER_LOAD)
    4128           0 :   NODE_NAME_CASE(REGISTER_STORE)
    4129           0 :   NODE_NAME_CASE(SAMPLE)
    4130           0 :   NODE_NAME_CASE(SAMPLEB)
    4131           0 :   NODE_NAME_CASE(SAMPLED)
    4132           0 :   NODE_NAME_CASE(SAMPLEL)
    4133           0 :   NODE_NAME_CASE(CVT_F32_UBYTE0)
    4134           0 :   NODE_NAME_CASE(CVT_F32_UBYTE1)
    4135           0 :   NODE_NAME_CASE(CVT_F32_UBYTE2)
    4136           0 :   NODE_NAME_CASE(CVT_F32_UBYTE3)
    4137           0 :   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
    4138           0 :   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
    4139           0 :   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
    4140           0 :   NODE_NAME_CASE(CVT_PK_I16_I32)
    4141           0 :   NODE_NAME_CASE(CVT_PK_U16_U32)
    4142           0 :   NODE_NAME_CASE(FP_TO_FP16)
    4143           0 :   NODE_NAME_CASE(FP16_ZEXT)
    4144           0 :   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
    4145           0 :   NODE_NAME_CASE(CONST_DATA_PTR)
    4146           0 :   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
    4147           0 :   NODE_NAME_CASE(KILL)
    4148           0 :   NODE_NAME_CASE(DUMMY_CHAIN)
    4149           0 :   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
    4150           0 :   NODE_NAME_CASE(INIT_EXEC)
    4151           0 :   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
    4152           0 :   NODE_NAME_CASE(SENDMSG)
    4153           0 :   NODE_NAME_CASE(SENDMSGHALT)
    4154           0 :   NODE_NAME_CASE(INTERP_MOV)
    4155           0 :   NODE_NAME_CASE(INTERP_P1)
    4156           0 :   NODE_NAME_CASE(INTERP_P2)
    4157           0 :   NODE_NAME_CASE(STORE_MSKOR)
    4158           0 :   NODE_NAME_CASE(LOAD_CONSTANT)
    4159           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
    4160           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
    4161           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
    4162           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
    4163           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
    4164           0 :   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
    4165           0 :   NODE_NAME_CASE(ATOMIC_INC)
    4166           0 :   NODE_NAME_CASE(ATOMIC_DEC)
    4167           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FADD)
    4168           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
    4169             :   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
    4170           0 :   NODE_NAME_CASE(BUFFER_LOAD)
    4171           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
    4172           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
    4173           0 :   NODE_NAME_CASE(SBUFFER_LOAD)
    4174           0 :   NODE_NAME_CASE(BUFFER_STORE)
    4175           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
    4176           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
    4177           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
    4178           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
    4179           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
    4180           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
    4181           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
    4182           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
    4183           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
    4184           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
    4185           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
    4186           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
    4187           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
    4188           0 : 
    4189           0 :   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
    4190           0 :   }
    4191           0 :   return nullptr;
    4192           0 : }
    4193           0 : 
    4194           0 : SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
    4195           0 :                                               SelectionDAG &DAG, int Enabled,
    4196           0 :                                               int &RefinementSteps,
    4197           0 :                                               bool &UseOneConstNR,
    4198           0 :                                               bool Reciprocal) const {
    4199           0 :   EVT VT = Operand.getValueType();
    4200           0 : 
    4201           0 :   if (VT == MVT::f32) {
    4202           0 :     RefinementSteps = 0;
    4203           0 :     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
    4204           0 :   }
    4205           0 : 
    4206           0 :   // TODO: There is also f64 rsq instruction, but the documentation is less
    4207           0 :   // clear on its precision.
    4208             : 
    4209             :   return SDValue();
    4210             : }
    4211           0 : 
    4212             : SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
    4213             :                                                SelectionDAG &DAG, int Enabled,
    4214           8 :                                                int &RefinementSteps) const {
    4215             :   EVT VT = Operand.getValueType();
    4216             : 
    4217             :   if (VT == MVT::f32) {
    4218             :     // Reciprocal, < 1 ulp error.
    4219           8 :     //
    4220             :     // This reciprocal approximation converges to < 0.5 ulp error with one
    4221             :     // newton rhapson performed with two fused multiple adds (FMAs).
    4222           5 : 
    4223          10 :     RefinementSteps = 0;
    4224             :     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
    4225             :   }
    4226             : 
    4227             :   // TODO: There is also f64 rcp instruction, but the documentation is less
    4228             :   // clear on its precision.
    4229           3 : 
    4230             :   return SDValue();
    4231             : }
    4232         326 : 
    4233             : void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    4234             :     const SDValue Op, KnownBits &Known,
    4235         326 :     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
    4236             : 
    4237             :   Known.resetAll(); // Don't know anything.
    4238             : 
    4239             :   unsigned Opc = Op.getOpcode();
    4240             : 
    4241             :   switch (Opc) {
    4242             :   default:
    4243         290 :     break;
    4244         580 :   case AMDGPUISD::CARRY:
    4245             :   case AMDGPUISD::BORROW: {
    4246             :     Known.Zero = APInt::getHighBitsSet(32, 31);
    4247             :     break;
    4248             :   }
    4249             : 
    4250          36 :   case AMDGPUISD::BFE_I32:
    4251             :   case AMDGPUISD::BFE_U32: {
    4252             :     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4253      277927 :     if (!CWidth)
    4254             :       return;
    4255             : 
    4256             :     uint32_t Width = CWidth->getZExtValue() & 0x1f;
    4257             : 
    4258             :     if (Opc == AMDGPUISD::BFE_U32)
    4259             :       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
    4260             : 
    4261      277927 :     break;
    4262             :   }
    4263             :   case AMDGPUISD::FP_TO_FP16:
    4264        8428 :   case AMDGPUISD::FP16_ZEXT: {
    4265             :     unsigned BitWidth = Known.getBitWidth();
    4266        8428 : 
    4267        8428 :     // High bits are zero.
    4268             :     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    4269             :     break;
    4270         492 :   }
    4271             :   case AMDGPUISD::MUL_U24:
    4272             :   case AMDGPUISD::MUL_I24: {
    4273             :     KnownBits LHSKnown, RHSKnown;
    4274             :     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
    4275             :     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
    4276         492 : 
    4277             :     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
    4278         492 :                       RHSKnown.countMinTrailingZeros();
    4279         920 :     Known.Zero.setLowBits(std::min(TrailZ, 32u));
    4280             : 
    4281             :     unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
    4282             :     unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
    4283             :     unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
    4284             :     if (MaxValBits >= 32)
    4285             :       break;
    4286             :     bool Negative = false;
    4287             :     if (Opc == AMDGPUISD::MUL_I24) {
    4288        2221 :       bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
    4289        2221 :       bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
    4290             :       bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
    4291             :       bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
    4292             :       if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
    4293       57293 :         break;
    4294      114586 :       Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
    4295       57293 :     }
    4296             :     if (Negative)
    4297       57293 :       Known.One.setHighBits(32 - MaxValBits);
    4298       57293 :     else
    4299       57293 :       Known.Zero.setHighBits(32 - MaxValBits);
    4300             :     break;
    4301       57293 :   }
    4302       57293 :   case AMDGPUISD::PERM: {
    4303       57293 :     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4304       57293 :     if (!CMask)
    4305             :       return;
    4306             : 
    4307       30013 :     KnownBits LHSKnown, RHSKnown;
    4308           5 :     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
    4309           5 :     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
    4310           5 :     unsigned Sel = CMask->getZExtValue();
    4311           5 : 
    4312           5 :     for (unsigned I = 0; I < 32; I += 8) {
    4313             :       unsigned SelBits = Sel & 0xff;
    4314           5 :       if (SelBits < 4) {
    4315             :         SelBits *= 8;
    4316             :         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
    4317             :         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    4318             :       } else if (SelBits < 7) {
    4319             :         SelBits = (SelBits & 3) * 8;
    4320             :         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
    4321             :         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    4322          20 :       } else if (SelBits == 0x0c) {
    4323             :         Known.Zero |= 0xff << I;
    4324             :       } else if (SelBits > 0x0c) {
    4325           0 :         Known.One |= 0xff << I;
    4326             :       }
    4327          20 :       Sel >>= 8;
    4328          40 :     }
    4329          20 :     break;
    4330          40 :   }
    4331             :   case ISD::INTRINSIC_WO_CHAIN: {
    4332         100 :     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4333          80 :     switch (IID) {
    4334          80 :     case Intrinsic::amdgcn_mbcnt_lo:
    4335          20 :     case Intrinsic::amdgcn_mbcnt_hi: {
    4336          20 :       const GCNSubtarget &ST =
    4337          20 :           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
    4338          60 :       // These return at most the wavefront size - 1.
    4339          34 :       unsigned Size = Op.getValueType().getSizeInBits();
    4340          34 :       Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
    4341          34 :       break;
    4342          26 :     }
    4343          24 :     default:
    4344           2 :       break;
    4345           0 :     }
    4346             :   }
    4347          80 :   }
    4348             : }
    4349             : 
    4350             : unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    4351      149030 :     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    4352      298060 :     unsigned Depth) const {
    4353             :   switch (Op.getOpcode()) {
    4354        5727 :   case AMDGPUISD::BFE_I32: {
    4355             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4356             :     if (!Width)
    4357        5727 :       return 1;
    4358             : 
    4359        5727 :     unsigned SignBits = 32 - Width->getZExtValue() + 1;
    4360       11454 :     if (!isNullConstant(Op.getOperand(1)))
    4361             :       return SignBits;
    4362             : 
    4363             :     // TODO: Could probably figure something out with non-0 offsets.
    4364             :     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    4365             :     return std::max(SignBits, Op0SignBits);
    4366             :   }
    4367             : 
    4368             :   case AMDGPUISD::BFE_U32: {
    4369             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4370        2070 :     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
    4371             :   }
    4372             : 
    4373        2070 :   case AMDGPUISD::CARRY:
    4374           2 :   case AMDGPUISD::BORROW:
    4375             :     return 31;
    4376             :   case AMDGPUISD::FP_TO_FP16:
    4377             :   case AMDGPUISD::FP16_ZEXT:
    4378             :     return 16;
    4379           4 :   default:
    4380           2 :     return 1;
    4381             :   }
    4382             : }
    4383             : 
    4384           0 : bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
    4385           0 :                                                         const SelectionDAG &DAG,
    4386             :                                                         bool SNaN,
    4387             :                                                         unsigned Depth) const {
    4388           0 :   unsigned Opcode = Op.getOpcode();
    4389             :   switch (Opcode) {
    4390           0 :   case AMDGPUISD::FMIN_LEGACY:
    4391             :   case AMDGPUISD::FMAX_LEGACY: {
    4392             :     if (SNaN)
    4393             :       return true;
    4394             : 
    4395             :     // TODO: Can check no nans on one of the operands for each one, but which
    4396           2 :     // one?
    4397             :     return false;
    4398           2 :   }
    4399        1368 :   case AMDGPUISD::FMUL_LEGACY:
    4400        1368 :   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    4401             :     if (SNaN)
    4402             :       return true;
    4403             :     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
    4404          27 :            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    4405             :   }
    4406             :   case AMDGPUISD::FMED3:
    4407             :   case AMDGPUISD::FMIN3:
    4408             :   case AMDGPUISD::FMAX3:
    4409          27 :   case AMDGPUISD::FMAD_FTZ: {
    4410           0 :     if (SNaN)
    4411             :       return true;
    4412           0 :     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
    4413           0 :            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
    4414             :            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    4415             :   }
    4416             :   case AMDGPUISD::CVT_F32_UBYTE0:
    4417             :   case AMDGPUISD::CVT_F32_UBYTE1:
    4418             :   case AMDGPUISD::CVT_F32_UBYTE2:
    4419           1 :   case AMDGPUISD::CVT_F32_UBYTE3:
    4420             :     return true;
    4421           1 : 
    4422             :   case AMDGPUISD::RCP:
    4423           0 :   case AMDGPUISD::RSQ:
    4424           0 :   case AMDGPUISD::RCP_LEGACY:
    4425             :   case AMDGPUISD::RSQ_LEGACY:
    4426           1 :   case AMDGPUISD::RSQ_CLAMP: {
    4427             :     if (SNaN)
    4428             :       return true;
    4429             : 
    4430           1 :     // TODO: Need is known positive check.
    4431             :     return false;
    4432           0 :   }
    4433           0 :   case AMDGPUISD::LDEXP:
    4434           0 :   case AMDGPUISD::FRACT: {
    4435             :     if (SNaN)
    4436             :       return true;
    4437             :     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
    4438             :   }
    4439             :   case AMDGPUISD::DIV_SCALE:
    4440             :   case AMDGPUISD::DIV_FMAS:
    4441             :   case AMDGPUISD::DIV_FIXUP:
    4442           9 :   case AMDGPUISD::TRIG_PREOP:
    4443             :     // TODO: Refine on operands.
    4444             :     return SNaN;
    4445             :   case AMDGPUISD::SIN_HW:
    4446             :   case AMDGPUISD::COS_HW: {
    4447           9 :     // TODO: Need check for infinity
    4448           7 :     return SNaN;
    4449             :   }
    4450             :   case ISD::INTRINSIC_WO_CHAIN: {
    4451             :     unsigned IntrinsicID
    4452             :       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4453           2 :     // TODO: Handle more intrinsics
    4454             :     switch (IntrinsicID) {
    4455           2 :     case Intrinsic::amdgcn_cubeid:
    4456             :       return true;
    4457           0 : 
    4458             :     case Intrinsic::amdgcn_frexp_mant: {
    4459           0 :       if (SNaN)
    4460             :         return true;
    4461             :       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    4462             :     }
    4463             :     case Intrinsic::amdgcn_cvt_pkrtz: {
    4464           0 :       if (SNaN)
    4465           2 :         return true;
    4466             :       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
    4467             :              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    4468           2 :     }
    4469             :     case Intrinsic::amdgcn_fdot2:
    4470          11 :       // TODO: Refine on operand
    4471             :       return SNaN;
    4472          11 :     default:
    4473             :       return false;
    4474          11 :     }
    4475             :   }
    4476             :   default:
    4477             :     return false;
    4478           2 :   }
    4479           2 : }
    4480             : 
    4481           0 : TargetLowering::AtomicExpansionKind
    4482             : AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
    4483           0 :   if (RMW->getOperation() == AtomicRMWInst::Nand)
    4484           0 :     return AtomicExpansionKind::CmpXChg;
    4485             :   return AtomicExpansionKind::None;
    4486           0 : }

Generated by: LCOV version 1.13