LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUISelLowering.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 1531 1803 84.9 %
Date: 2018-06-17 00:07:59 Functions: 110 115 95.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// This is the parent TargetLowering class for hardware code gen
      12             : /// targets.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #define AMDGPU_LOG2E_F     1.44269504088896340735992468100189214f
      17             : #define AMDGPU_LN2_F       0.693147180559945309417232121458176568f
      18             : #define AMDGPU_LN10_F      2.30258509299404568401799145468436421f
      19             : 
      20             : #include "AMDGPUISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUCallLowering.h"
      23             : #include "AMDGPUFrameLowering.h"
      24             : #include "AMDGPUIntrinsicInfo.h"
      25             : #include "AMDGPURegisterInfo.h"
      26             : #include "AMDGPUSubtarget.h"
      27             : #include "AMDGPUTargetMachine.h"
      28             : #include "Utils/AMDGPUBaseInfo.h"
      29             : #include "R600MachineFunctionInfo.h"
      30             : #include "SIInstrInfo.h"
      31             : #include "SIMachineFunctionInfo.h"
      32             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      33             : #include "llvm/CodeGen/CallingConvLower.h"
      34             : #include "llvm/CodeGen/MachineFunction.h"
      35             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      36             : #include "llvm/CodeGen/SelectionDAG.h"
      37             : #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
      38             : #include "llvm/IR/DataLayout.h"
      39             : #include "llvm/IR/DiagnosticInfo.h"
      40             : #include "llvm/Support/KnownBits.h"
      41             : using namespace llvm;
      42             : 
      43       41191 : static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
      44             :                             CCValAssign::LocInfo LocInfo,
      45             :                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
      46       41191 :   MachineFunction &MF = State.getMachineFunction();
      47       41191 :   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
      48             : 
      49       41191 :   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
      50             :                                          ArgFlags.getOrigAlign());
      51      123573 :   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
      52       41191 :   return true;
      53             : }
      54             : 
      55        1570 : static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
      56             :                            CCValAssign::LocInfo LocInfo,
      57             :                            ISD::ArgFlagsTy ArgFlags, CCState &State,
      58             :                            const TargetRegisterClass *RC,
      59             :                            unsigned NumRegs) {
      60        3140 :   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
      61        1570 :   unsigned RegResult = State.AllocateReg(RegList);
      62        1570 :   if (RegResult == AMDGPU::NoRegister)
      63             :     return false;
      64             : 
      65        3068 :   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
      66             :   return true;
      67             : }
      68             : 
      69         265 : static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
      70             :                               CCValAssign::LocInfo LocInfo,
      71             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
      72         265 :   switch (LocVT.SimpleTy) {
      73         265 :   case MVT::i64:
      74             :   case MVT::f64:
      75             :   case MVT::v2i32:
      76             :   case MVT::v2f32:
      77             :   case MVT::v4i16:
      78             :   case MVT::v4f16: {
      79             :     // Up to SGPR0-SGPR39
      80             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
      81         265 :                           &AMDGPU::SGPR_64RegClass, 20);
      82             :   }
      83             :   default:
      84             :     return false;
      85             :   }
      86             : }
      87             : 
      88             : // Allocate up to VGPR31.
      89             : //
      90             : // TODO: Since there are no VGPR alignment requirements, would it be better to
      91             : // split into individual scalar registers?
      92        1305 : static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
      93             :                               CCValAssign::LocInfo LocInfo,
      94             :                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
      95        1305 :   switch (LocVT.SimpleTy) {
      96         834 :   case MVT::i64:
      97             :   case MVT::f64:
      98             :   case MVT::v2i32:
      99             :   case MVT::v2f32:
     100             :   case MVT::v4i16:
     101             :   case MVT::v4f16: {
     102             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     103         834 :                           &AMDGPU::VReg_64RegClass, 31);
     104             :   }
     105         270 :   case MVT::v4i32:
     106             :   case MVT::v4f32:
     107             :   case MVT::v2i64:
     108             :   case MVT::v2f64: {
     109             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     110         270 :                           &AMDGPU::VReg_128RegClass, 29);
     111             :   }
     112          55 :   case MVT::v8i32:
     113             :   case MVT::v8f32: {
     114             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     115          55 :                           &AMDGPU::VReg_256RegClass, 25);
     116             : 
     117             :   }
     118         146 :   case MVT::v16i32:
     119             :   case MVT::v16f32: {
     120             :     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
     121         146 :                           &AMDGPU::VReg_512RegClass, 17);
     122             : 
     123             :   }
     124             :   default:
     125             :     return false;
     126             :   }
     127             : }
     128             : 
     129             : #include "AMDGPUGenCallingConv.inc"
     130             : 
     131             : // Find a larger type to do a load / store of a vector with.
     132        4660 : EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
     133             :   unsigned StoreSize = VT.getStoreSizeInBits();
     134        4660 :   if (StoreSize <= 32)
     135        1306 :     return EVT::getIntegerVT(Ctx, StoreSize);
     136             : 
     137             :   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
     138        6708 :   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
     139             : }
     140             : 
     141        8727 : unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
     142        8727 :   KnownBits Known;
     143       17454 :   EVT VT = Op.getValueType();
     144        8727 :   DAG.computeKnownBits(Op, Known);
     145             : 
     146       26181 :   return VT.getSizeInBits() - Known.countMinLeadingZeros();
     147             : }
     148             : 
     149        4324 : unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
     150        8648 :   EVT VT = Op.getValueType();
     151             : 
     152             :   // In order for this to be a signed 24-bit value, bit 23 must
     153             :   // be a sign bit.
     154        4324 :   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
     155             : }
     156             : 
     157        2527 : AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     158        2527 :                                            const AMDGPUSubtarget &STI)
     159        2527 :     : TargetLowering(TM), Subtarget(&STI) {
     160        2527 :   AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
     161             :   // Lower floating point store/load to integer store/load to reduce the number
     162             :   // of patterns in tablegen.
     163             :   setOperationAction(ISD::LOAD, MVT::f32, Promote);
     164             :   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
     165             : 
     166             :   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
     167             :   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
     168             : 
     169             :   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
     170             :   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
     171             : 
     172             :   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
     173             :   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
     174             : 
     175             :   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
     176             :   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
     177             : 
     178             :   setOperationAction(ISD::LOAD, MVT::i64, Promote);
     179             :   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
     180             : 
     181             :   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
     182             :   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
     183             : 
     184             :   setOperationAction(ISD::LOAD, MVT::f64, Promote);
     185             :   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
     186             : 
     187             :   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
     188             :   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
     189             : 
     190             :   // There are no 64-bit extloads. These should be done as a 32-bit extload and
     191             :   // an extension to 64-bit.
     192       17689 :   for (MVT VT : MVT::integer_valuetypes()) {
     193             :     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
     194             :     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
     195             :     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
     196             :   }
     197             : 
     198       17689 :   for (MVT VT : MVT::integer_valuetypes()) {
     199       15162 :     if (VT == MVT::i64)
     200        2527 :       continue;
     201             : 
     202             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     203             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
     204             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
     205             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
     206             : 
     207             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     208             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
     209             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
     210             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
     211             : 
     212             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     213             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
     214             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
     215             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
     216             :   }
     217             : 
     218      181944 :   for (MVT VT : MVT::integer_vector_valuetypes()) {
     219             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
     220             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
     221             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
     222             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
     223             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
     224             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
     225             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
     226             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
     227             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
     228             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
     229             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
     230             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
     231             :   }
     232             : 
     233        2527 :   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
     234             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
     235             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
     236             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
     237             : 
     238             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
     239             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
     240             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
     241             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
     242             : 
     243             :   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
     244             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
     245             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
     246             :   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
     247             : 
     248             :   setOperationAction(ISD::STORE, MVT::f32, Promote);
     249             :   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
     250             : 
     251             :   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
     252             :   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
     253             : 
     254             :   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
     255             :   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
     256             : 
     257             :   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
     258             :   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
     259             : 
     260             :   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
     261             :   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
     262             : 
     263             :   setOperationAction(ISD::STORE, MVT::i64, Promote);
     264             :   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
     265             : 
     266             :   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
     267             :   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
     268             : 
     269             :   setOperationAction(ISD::STORE, MVT::f64, Promote);
     270             :   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
     271             : 
     272             :   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
     273             :   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
     274             : 
     275             :   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
     276             :   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
     277             :   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     278             :   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
     279             : 
     280             :   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
     281             :   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
     282             :   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
     283             :   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
     284             : 
     285             :   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
     286             :   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
     287             :   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
     288             :   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
     289             : 
     290             :   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
     291             :   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
     292             : 
     293             :   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
     294             :   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
     295             : 
     296             :   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
     297             :   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
     298             : 
     299             :   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
     300             :   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
     301             : 
     302             : 
     303             :   setOperationAction(ISD::Constant, MVT::i32, Legal);
     304             :   setOperationAction(ISD::Constant, MVT::i64, Legal);
     305             :   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
     306             :   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
     307             : 
     308             :   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
     309             :   setOperationAction(ISD::BRIND, MVT::Other, Expand);
     310             : 
     311             :   // This is totally unsupported, just custom lower to produce an error.
     312             :   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
     313             : 
     314             :   // Library functions.  These default to Expand, but we have instructions
     315             :   // for them.
     316             :   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
     317             :   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
     318             :   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
     319             :   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
     320             :   setOperationAction(ISD::FABS,   MVT::f32, Legal);
     321             :   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     322             :   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
     323             :   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
     324             :   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     325             :   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
     326             : 
     327             :   setOperationAction(ISD::FROUND, MVT::f32, Custom);
     328             :   setOperationAction(ISD::FROUND, MVT::f64, Custom);
     329             : 
     330             :   setOperationAction(ISD::FLOG, MVT::f32, Custom);
     331             :   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
     332             : 
     333        2527 :   if (Subtarget->has16BitInsts()) {
     334             :     setOperationAction(ISD::FLOG, MVT::f16, Custom);
     335             :     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
     336             :   }
     337             : 
     338             :   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
     339             :   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
     340             : 
     341             :   setOperationAction(ISD::FREM, MVT::f32, Custom);
     342             :   setOperationAction(ISD::FREM, MVT::f64, Custom);
     343             : 
     344             :   // v_mad_f32 does not support denormals according to some sources.
     345        2527 :   if (!Subtarget->hasFP32Denormals())
     346             :     setOperationAction(ISD::FMAD, MVT::f32, Legal);
     347             : 
     348             :   // Expand to fneg + fadd.
     349             :   setOperationAction(ISD::FSUB, MVT::f64, Expand);
     350             : 
     351             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
     352             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
     353             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
     354             :   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
     355             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
     356             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
     357             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
     358             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
     359             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
     360             :   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
     361             : 
     362        2527 :   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
     363             :     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
     364             :     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
     365             :     setOperationAction(ISD::FRINT, MVT::f64, Custom);
     366             :     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
     367             :   }
     368             : 
     369        2527 :   if (!Subtarget->hasBFI()) {
     370             :     // fcopysign can be done in a single instruction with BFI.
     371             :     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
     372             :     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     373             :   }
     374             : 
     375             :   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
     376             :   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
     377             :   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
     378             : 
     379        2527 :   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
     380       12635 :   for (MVT VT : ScalarIntVTs) {
     381             :     // These should use [SU]DIVREM, so set them to expand
     382             :     setOperationAction(ISD::SDIV, VT, Expand);
     383             :     setOperationAction(ISD::UDIV, VT, Expand);
     384             :     setOperationAction(ISD::SREM, VT, Expand);
     385             :     setOperationAction(ISD::UREM, VT, Expand);
     386             : 
     387             :     // GPU does not have divrem function for signed or unsigned.
     388             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     389             :     setOperationAction(ISD::UDIVREM, VT, Custom);
     390             : 
     391             :     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
     392             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     393             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     394             : 
     395             :     setOperationAction(ISD::BSWAP, VT, Expand);
     396             :     setOperationAction(ISD::CTTZ, VT, Expand);
     397             :     setOperationAction(ISD::CTLZ, VT, Expand);
     398             : 
     399             :     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
     400             :     setOperationAction(ISD::ADDC, VT, Legal);
     401             :     setOperationAction(ISD::SUBC, VT, Legal);
     402             :     setOperationAction(ISD::ADDE, VT, Legal);
     403             :     setOperationAction(ISD::SUBE, VT, Legal);
     404             :   }
     405             : 
     406        2527 :   if (!Subtarget->hasBCNT(32))
     407             :     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
     408             : 
     409        2527 :   if (!Subtarget->hasBCNT(64))
     410             :     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
     411             : 
     412             :   // The hardware supports 32-bit ROTR, but not ROTL.
     413             :   setOperationAction(ISD::ROTL, MVT::i32, Expand);
     414             :   setOperationAction(ISD::ROTL, MVT::i64, Expand);
     415             :   setOperationAction(ISD::ROTR, MVT::i64, Expand);
     416             : 
     417             :   setOperationAction(ISD::MUL, MVT::i64, Expand);
     418             :   setOperationAction(ISD::MULHU, MVT::i64, Expand);
     419             :   setOperationAction(ISD::MULHS, MVT::i64, Expand);
     420             :   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
     421             :   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
     422             :   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     423             :   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     424             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     425             : 
     426             :   setOperationAction(ISD::SMIN, MVT::i32, Legal);
     427             :   setOperationAction(ISD::UMIN, MVT::i32, Legal);
     428             :   setOperationAction(ISD::SMAX, MVT::i32, Legal);
     429             :   setOperationAction(ISD::UMAX, MVT::i32, Legal);
     430             : 
     431        2527 :   if (Subtarget->hasFFBH())
     432             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
     433             : 
     434        2527 :   if (Subtarget->hasFFBL())
     435             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
     436             : 
     437             :   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
     438             :   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
     439             :   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
     440             :   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
     441             : 
     442             :   // We only really have 32-bit BFE instructions (and 16-bit on VI).
     443             :   //
     444             :   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
     445             :   // effort to match them now. We want this to be false for i64 cases when the
     446             :   // extraction isn't restricted to the upper or lower half. Ideally we would
     447             :   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
     448             :   // span the midpoint are probably relatively rare, so don't worry about them
     449             :   // for now.
     450        2527 :   if (Subtarget->hasBFE())
     451             :     setHasExtractBitsInsn(true);
     452             : 
     453             :   static const MVT::SimpleValueType VectorIntTypes[] = {
     454             :     MVT::v2i32, MVT::v4i32
     455             :   };
     456             : 
     457       12635 :   for (MVT VT : VectorIntTypes) {
     458             :     // Expand the following operations for the current type by default.
     459             :     setOperationAction(ISD::ADD,  VT, Expand);
     460             :     setOperationAction(ISD::AND,  VT, Expand);
     461             :     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
     462             :     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
     463             :     setOperationAction(ISD::MUL,  VT, Expand);
     464             :     setOperationAction(ISD::MULHU, VT, Expand);
     465             :     setOperationAction(ISD::MULHS, VT, Expand);
     466             :     setOperationAction(ISD::OR,   VT, Expand);
     467             :     setOperationAction(ISD::SHL,  VT, Expand);
     468             :     setOperationAction(ISD::SRA,  VT, Expand);
     469             :     setOperationAction(ISD::SRL,  VT, Expand);
     470             :     setOperationAction(ISD::ROTL, VT, Expand);
     471             :     setOperationAction(ISD::ROTR, VT, Expand);
     472             :     setOperationAction(ISD::SUB,  VT, Expand);
     473             :     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
     474             :     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     475             :     setOperationAction(ISD::SDIV, VT, Expand);
     476             :     setOperationAction(ISD::UDIV, VT, Expand);
     477             :     setOperationAction(ISD::SREM, VT, Expand);
     478             :     setOperationAction(ISD::UREM, VT, Expand);
     479             :     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     480             :     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     481             :     setOperationAction(ISD::SDIVREM, VT, Custom);
     482             :     setOperationAction(ISD::UDIVREM, VT, Expand);
     483             :     setOperationAction(ISD::SELECT, VT, Expand);
     484             :     setOperationAction(ISD::VSELECT, VT, Expand);
     485             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     486             :     setOperationAction(ISD::XOR,  VT, Expand);
     487             :     setOperationAction(ISD::BSWAP, VT, Expand);
     488             :     setOperationAction(ISD::CTPOP, VT, Expand);
     489             :     setOperationAction(ISD::CTTZ, VT, Expand);
     490             :     setOperationAction(ISD::CTLZ, VT, Expand);
     491             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     492             :     setOperationAction(ISD::SETCC, VT, Expand);
     493             :   }
     494             : 
     495             :   static const MVT::SimpleValueType FloatVectorTypes[] = {
     496             :     MVT::v2f32, MVT::v4f32
     497             :   };
     498             : 
     499       12635 :   for (MVT VT : FloatVectorTypes) {
     500             :     setOperationAction(ISD::FABS, VT, Expand);
     501             :     setOperationAction(ISD::FMINNUM, VT, Expand);
     502             :     setOperationAction(ISD::FMAXNUM, VT, Expand);
     503             :     setOperationAction(ISD::FADD, VT, Expand);
     504             :     setOperationAction(ISD::FCEIL, VT, Expand);
     505             :     setOperationAction(ISD::FCOS, VT, Expand);
     506             :     setOperationAction(ISD::FDIV, VT, Expand);
     507             :     setOperationAction(ISD::FEXP2, VT, Expand);
     508             :     setOperationAction(ISD::FLOG2, VT, Expand);
     509             :     setOperationAction(ISD::FREM, VT, Expand);
     510             :     setOperationAction(ISD::FLOG, VT, Expand);
     511             :     setOperationAction(ISD::FLOG10, VT, Expand);
     512             :     setOperationAction(ISD::FPOW, VT, Expand);
     513             :     setOperationAction(ISD::FFLOOR, VT, Expand);
     514             :     setOperationAction(ISD::FTRUNC, VT, Expand);
     515             :     setOperationAction(ISD::FMUL, VT, Expand);
     516             :     setOperationAction(ISD::FMA, VT, Expand);
     517             :     setOperationAction(ISD::FRINT, VT, Expand);
     518             :     setOperationAction(ISD::FNEARBYINT, VT, Expand);
     519             :     setOperationAction(ISD::FSQRT, VT, Expand);
     520             :     setOperationAction(ISD::FSIN, VT, Expand);
     521             :     setOperationAction(ISD::FSUB, VT, Expand);
     522             :     setOperationAction(ISD::FNEG, VT, Expand);
     523             :     setOperationAction(ISD::VSELECT, VT, Expand);
     524             :     setOperationAction(ISD::SELECT_CC, VT, Expand);
     525             :     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     526             :     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     527             :     setOperationAction(ISD::SETCC, VT, Expand);
     528             :   }
     529             : 
     530             :   // This causes using an unrolled select operation rather than expansion with
     531             :   // bit operations. This is in general better, but the alternative using BFI
     532             :   // instructions may be better if the select sources are SGPRs.
     533             :   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
     534             :   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
     535             : 
     536             :   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
     537             :   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
     538             : 
     539             :   // There are no libcalls of any kind.
     540     2337475 :   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     541             :     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
     542             : 
     543             :   setBooleanContents(ZeroOrNegativeOneBooleanContent);
     544             :   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     545             : 
     546             :   setSchedulingPreference(Sched::RegPressure);
     547        2527 :   setJumpIsExpensive(true);
     548             : 
     549             :   // FIXME: This is only partially true. If we have to do vector compares, any
     550             :   // SGPR pair can be a condition register. If we have a uniform condition, we
     551             :   // are better off doing SALU operations, where there is only one SCC. For now,
     552             :   // we don't have a way of knowing during instruction selection if a condition
     553             :   // will be uniform and we always use vector compares. Assume we are using
     554             :   // vector compares until that is fixed.
     555             :   setHasMultipleConditionRegisters(true);
     556             : 
     557             :   // SI at least has hardware support for floating point exceptions, but no way
     558             :   // of using or handling them is implemented. They are also optional in OpenCL
     559             :   // (Section 7.3)
     560        2527 :   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
     561             : 
     562        2527 :   PredictableSelectIsExpensive = false;
     563             : 
     564             :   // We want to find all load dependencies for long chains of stores to enable
     565             :   // merging into very wide vectors. The problem is with vectors with > 4
     566             :   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
     567             :   // vectors are a legal type, even though we have to split the loads
     568             :   // usually. When we can more precisely specify load legality per address
     569             :   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
     570             :   // smarter so that they can figure out what to do in 2 iterations without all
     571             :   // N > 4 stores on the same chain.
     572        2527 :   GatherAllAliasesMaxDepth = 16;
     573             : 
     574             :   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
     575             :   // about these during lowering.
     576        2527 :   MaxStoresPerMemcpy  = 0xffffffff;
     577        2527 :   MaxStoresPerMemmove = 0xffffffff;
     578        2527 :   MaxStoresPerMemset  = 0xffffffff;
     579             : 
     580             :   setTargetDAGCombine(ISD::BITCAST);
     581             :   setTargetDAGCombine(ISD::SHL);
     582             :   setTargetDAGCombine(ISD::SRA);
     583             :   setTargetDAGCombine(ISD::SRL);
     584             :   setTargetDAGCombine(ISD::TRUNCATE);
     585             :   setTargetDAGCombine(ISD::MUL);
     586             :   setTargetDAGCombine(ISD::MULHU);
     587             :   setTargetDAGCombine(ISD::MULHS);
     588             :   setTargetDAGCombine(ISD::SELECT);
     589             :   setTargetDAGCombine(ISD::SELECT_CC);
     590             :   setTargetDAGCombine(ISD::STORE);
     591             :   setTargetDAGCombine(ISD::FADD);
     592             :   setTargetDAGCombine(ISD::FSUB);
     593             :   setTargetDAGCombine(ISD::FNEG);
     594             :   setTargetDAGCombine(ISD::FABS);
     595             :   setTargetDAGCombine(ISD::AssertZext);
     596             :   setTargetDAGCombine(ISD::AssertSext);
     597        2527 : }
     598             : 
     599             : //===----------------------------------------------------------------------===//
     600             : // Target Information
     601             : //===----------------------------------------------------------------------===//
     602             : 
     603             : LLVM_READNONE
     604         768 : static bool fnegFoldsIntoOp(unsigned Opc) {
     605         768 :   switch (Opc) {
     606             :   case ISD::FADD:
     607             :   case ISD::FSUB:
     608             :   case ISD::FMUL:
     609             :   case ISD::FMA:
     610             :   case ISD::FMAD:
     611             :   case ISD::FMINNUM:
     612             :   case ISD::FMAXNUM:
     613             :   case ISD::FSIN:
     614             :   case ISD::FTRUNC:
     615             :   case ISD::FRINT:
     616             :   case ISD::FNEARBYINT:
     617             :   case AMDGPUISD::RCP:
     618             :   case AMDGPUISD::RCP_LEGACY:
     619             :   case AMDGPUISD::SIN_HW:
     620             :   case AMDGPUISD::FMUL_LEGACY:
     621             :   case AMDGPUISD::FMIN_LEGACY:
     622             :   case AMDGPUISD::FMAX_LEGACY:
     623             :     return true;
     624         552 :   default:
     625         552 :     return false;
     626             :   }
     627             : }
     628             : 
     629             : /// \p returns true if the operation will definitely need to use a 64-bit
     630             : /// encoding, and thus will use a VOP3 encoding regardless of the source
     631             : /// modifiers.
     632             : LLVM_READONLY
     633             : static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
     634        2250 :   return N->getNumOperands() > 2 || VT == MVT::f64;
     635             : }
     636             : 
     637             : // Most FP instructions support source modifiers, but this could be refined
     638             : // slightly.
     639             : LLVM_READONLY
     640        2887 : static bool hasSourceMods(const SDNode *N) {
     641        2887 :   if (isa<MemSDNode>(N))
     642             :     return false;
     643             : 
     644        5222 :   switch (N->getOpcode()) {
     645             :   case ISD::CopyToReg:
     646             :   case ISD::SELECT:
     647             :   case ISD::FDIV:
     648             :   case ISD::FREM:
     649             :   case ISD::INLINEASM:
     650             :   case AMDGPUISD::INTERP_P1:
     651             :   case AMDGPUISD::INTERP_P2:
     652             :   case AMDGPUISD::DIV_SCALE:
     653             : 
     654             :   // TODO: Should really be looking at the users of the bitcast. These are
     655             :   // problematic because bitcasts are used to legalize all stores to integer
     656             :   // types.
     657             :   case ISD::BITCAST:
     658             :     return false;
     659        2250 :   default:
     660        2250 :     return true;
     661             :   }
     662             : }
     663             : 
     664        2787 : bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
     665             :                                                  unsigned CostThreshold) {
     666             :   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
     667             :   // it is truly free to use a source modifier in all cases. If there are
     668             :   // multiple users but for each one will necessitate using VOP3, there will be
     669             :   // a code size increase. Try to avoid increasing code size unless we know it
     670             :   // will save on the instruction count.
     671             :   unsigned NumMayIncreaseSize = 0;
     672        5574 :   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
     673             : 
     674             :   // XXX - Should this limit number of uses to check?
     675        4025 :   for (const SDNode *U : N->uses()) {
     676        2887 :     if (!hasSourceMods(U))
     677             :       return false;
     678             : 
     679             :     if (!opMustUseVOP3Encoding(U, VT)) {
     680        1181 :       if (++NumMayIncreaseSize > CostThreshold)
     681             :         return false;
     682             :     }
     683             :   }
     684             : 
     685             :   return true;
     686             : }
     687             : 
     688      108503 : MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
     689      108503 :   return MVT::i32;
     690             : }
     691             : 
     692        1586 : bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
     693        1586 :   return true;
     694             : }
     695             : 
     696             : // The backend supports 32 and 64 bit floating point immediates.
     697             : // FIXME: Why are we reporting vectors of FP immediates as legal?
     698           0 : bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
     699           0 :   EVT ScalarVT = VT.getScalarType();
     700             :   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
     701           0 :          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
     702             : }
     703             : 
     704             : // We don't want to shrink f64 / f32 constants.
     705           0 : bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
     706           0 :   EVT ScalarVT = VT.getScalarType();
     707           0 :   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
     708             : }
     709             : 
     710        2413 : bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
     711             :                                                  ISD::LoadExtType,
     712             :                                                  EVT NewVT) const {
     713             : 
     714             :   unsigned NewSize = NewVT.getStoreSizeInBits();
     715             : 
     716             :   // If we are reducing to a 32-bit load, this is always better.
     717        2413 :   if (NewSize == 32)
     718             :     return true;
     719             : 
     720        4134 :   EVT OldVT = N->getValueType(0);
     721             :   unsigned OldSize = OldVT.getStoreSizeInBits();
     722             : 
     723             :   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
     724             :   // extloads, so doing one requires using a buffer_load. In cases where we
     725             :   // still couldn't use a scalar load, using the wider load shouldn't really
     726             :   // hurt anything.
     727             : 
     728             :   // If the old size already had to be an extload, there's no harm in continuing
     729             :   // to reduce the width.
     730        2067 :   return (OldSize < 32);
     731             : }
     732             : 
     733       12626 : bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
     734             :                                                    EVT CastTy) const {
     735             : 
     736             :   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
     737             : 
     738       25252 :   if (LoadTy.getScalarType() == MVT::i32)
     739             :     return false;
     740             : 
     741             :   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
     742             :   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
     743             : 
     744        1972 :   return (LScalarSize < CastScalarSize) ||
     745        1972 :          (CastScalarSize >= 32);
     746             : }
     747             : 
     748             : // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
     749             : // profitable with the expansion for 64-bit since it's generally good to
     750             : // speculate things.
     751             : // FIXME: These should really have the size as a parameter.
     752          27 : bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
     753          27 :   return true;
     754             : }
     755             : 
     756          60 : bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
     757          60 :   return true;
     758             : }
     759             : 
     760     2576549 : bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
     761     5153098 :   switch (N->getOpcode()) {
     762             :     default:
     763             :     return false;
     764      125654 :     case ISD::EntryToken:
     765             :     case ISD::TokenFactor:
     766      125654 :       return true;
     767       15968 :     case ISD::INTRINSIC_WO_CHAIN:
     768             :     {
     769       47904 :       unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     770       15968 :       switch (IntrID) {
     771             :         default:
     772             :         return false;
     773          17 :         case Intrinsic::amdgcn_readfirstlane:
     774             :         case Intrinsic::amdgcn_readlane:
     775          17 :           return true;
     776             :       }
     777             :     }
     778             :     break;
     779             :     case ISD::LOAD:
     780             :     {
     781             :       const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
     782      354880 :       if (L->getMemOperand()->getAddrSpace()
     783             :       == Subtarget->getAMDGPUAS().CONSTANT_ADDRESS_32BIT)
     784             :         return true;
     785      177218 :       return false;
     786             :     }
     787             :     break;
     788             :   }
     789             : }
     790             : 
     791             : //===---------------------------------------------------------------------===//
     792             : // Target Properties
     793             : //===---------------------------------------------------------------------===//
     794             : 
     795        2010 : bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
     796             :   assert(VT.isFloatingPoint());
     797             : 
     798             :   // Packed operations do not have a fabs modifier.
     799         478 :   return VT == MVT::f32 || VT == MVT::f64 ||
     800        2488 :          (Subtarget->has16BitInsts() && VT == MVT::f16);
     801             : }
     802             : 
     803        3525 : bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
     804             :   assert(VT.isFloatingPoint());
     805         578 :   return VT == MVT::f32 || VT == MVT::f64 ||
     806         835 :          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
     807        3782 :          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
     808             : }
     809             : 
     810        4185 : bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
     811             :                                                          unsigned NumElem,
     812             :                                                          unsigned AS) const {
     813        4185 :   return true;
     814             : }
     815             : 
     816       27079 : bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
     817             :   // There are few operations which truly have vector input operands. Any vector
     818             :   // operation is going to involve operations on each component, and a
     819             :   // build_vector will be a copy per element, so it always makes sense to use a
     820             :   // build_vector input in place of the extracted element to avoid a copy into a
     821             :   // super register.
     822             :   //
     823             :   // We should probably only do this if all users are extracts only, but this
     824             :   // should be the common case.
     825       27079 :   return true;
     826             : }
     827             : 
     828       12014 : bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
     829             :   // Truncate is just accessing a subregister.
     830             : 
     831       12014 :   unsigned SrcSize = Source.getSizeInBits();
     832       12014 :   unsigned DestSize = Dest.getSizeInBits();
     833             : 
     834       12014 :   return DestSize < SrcSize && DestSize % 32 == 0 ;
     835             : }
     836             : 
     837         594 : bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
     838             :   // Truncate is just accessing a subregister.
     839             : 
     840         594 :   unsigned SrcSize = Source->getScalarSizeInBits();
     841         594 :   unsigned DestSize = Dest->getScalarSizeInBits();
     842             : 
     843         594 :   if (DestSize== 16 && Subtarget->has16BitInsts())
     844          20 :     return SrcSize >= 32;
     845             : 
     846         574 :   return DestSize < SrcSize && DestSize % 32 == 0;
     847             : }
     848             : 
     849          28 : bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
     850          28 :   unsigned SrcSize = Src->getScalarSizeInBits();
     851          28 :   unsigned DestSize = Dest->getScalarSizeInBits();
     852             : 
     853          28 :   if (SrcSize == 16 && Subtarget->has16BitInsts())
     854           0 :     return DestSize >= 32;
     855             : 
     856          28 :   return SrcSize == 32 && DestSize == 64;
     857             : }
     858             : 
     859        6253 : bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
     860             :   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
     861             :   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
     862             :   // this will enable reducing 64-bit operations the 32-bit, which is always
     863             :   // good.
     864             : 
     865             :   if (Src == MVT::i16)
     866             :     return Dest == MVT::i32 ||Dest == MVT::i64 ;
     867             : 
     868             :   return Src == MVT::i32 && Dest == MVT::i64;
     869             : }
     870             : 
     871        6118 : bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
     872       12236 :   return isZExtFree(Val.getValueType(), VT2);
     873             : }
     874             : 
     875        6256 : bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
     876             :   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
     877             :   // limited number of native 64-bit operations. Shrinking an operation to fit
     878             :   // in a single 32-bit register should always be helpful. As currently used,
     879             :   // this is much less general than the name suggests, and is only used in
     880             :   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
     881             :   // not profitable, and may actually be harmful.
     882        6256 :   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
     883             : }
     884             : 
     885             : //===---------------------------------------------------------------------===//
     886             : // TargetLowering Callbacks
     887             : //===---------------------------------------------------------------------===//
     888             : 
     889        3155 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
     890             :                                                   bool IsVarArg) {
     891        3155 :   switch (CC) {
     892             :   case CallingConv::AMDGPU_KERNEL:
     893             :   case CallingConv::SPIR_KERNEL:
     894             :     return CC_AMDGPU_Kernel;
     895        1136 :   case CallingConv::AMDGPU_VS:
     896             :   case CallingConv::AMDGPU_GS:
     897             :   case CallingConv::AMDGPU_PS:
     898             :   case CallingConv::AMDGPU_CS:
     899             :   case CallingConv::AMDGPU_HS:
     900             :   case CallingConv::AMDGPU_ES:
     901             :   case CallingConv::AMDGPU_LS:
     902        1136 :     return CC_AMDGPU;
     903        1993 :   case CallingConv::C:
     904             :   case CallingConv::Fast:
     905             :   case CallingConv::Cold:
     906        1993 :     return CC_AMDGPU_Func;
     907           0 :   default:
     908           0 :     report_fatal_error("Unsupported calling convention.");
     909             :   }
     910             : }
     911             : 
     912        4781 : CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
     913             :                                                     bool IsVarArg) {
     914        4781 :   switch (CC) {
     915             :   case CallingConv::AMDGPU_KERNEL:
     916             :   case CallingConv::SPIR_KERNEL:
     917             :     return CC_AMDGPU_Kernel;
     918        1075 :   case CallingConv::AMDGPU_VS:
     919             :   case CallingConv::AMDGPU_GS:
     920             :   case CallingConv::AMDGPU_PS:
     921             :   case CallingConv::AMDGPU_CS:
     922             :   case CallingConv::AMDGPU_HS:
     923             :   case CallingConv::AMDGPU_ES:
     924             :   case CallingConv::AMDGPU_LS:
     925        1075 :     return RetCC_SI_Shader;
     926        3706 :   case CallingConv::C:
     927             :   case CallingConv::Fast:
     928             :   case CallingConv::Cold:
     929        3706 :     return RetCC_AMDGPU_Func;
     930           0 :   default:
     931           0 :     report_fatal_error("Unsupported calling convention.");
     932             :   }
     933             : }
     934             : 
     935             : /// The SelectionDAGBuilder will automatically promote function arguments
     936             : /// with illegal types.  However, this does not work for the AMDGPU targets
     937             : /// since the function arguments are stored in memory as these illegal types.
     938             : /// In order to handle this properly we need to get the original types sizes
     939             : /// from the LLVM IR Function and fixup the ISD:InputArg values before
     940             : /// passing them to AnalyzeFormalArguments()
     941             : 
     942             : /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
     943             : /// input values across multiple registers.  Each item in the Ins array
     944             : /// represents a single value that will be stored in registers.  Ins[x].VT is
     945             : /// the value type of the value that will be stored in the register, so
     946             : /// whatever SDNode we lower the argument to needs to be this type.
     947             : ///
     948             : /// In order to correctly lower the arguments we need to know the size of each
     949             : /// argument.  Since Ins[x].VT gives us the size of the register that will
     950             : /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
     951             : /// for the orignal function argument so that we can deduce the correct memory
     952             : /// type to use for Ins[x].  In most cases the correct memory type will be
     953             : /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
     954             : /// we have a kernel argument of type v8i8, this argument will be split into
     955             : /// 8 parts and each part will be represented by its own item in the Ins array.
     956             : /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
     957             : /// the argument before it was split.  From this, we deduce that the memory type
     958             : /// for each individual part is i8.  We pass the memory type as LocVT to the
     959             : /// calling convention analysis function and the register type (Ins[x].VT) as
     960             : /// the ValVT.
     961       17546 : void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
     962             :                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
     963       58711 :   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     964       41165 :     const ISD::InputArg &In = Ins[i];
     965       41165 :     EVT MemVT;
     966             : 
     967       41165 :     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
     968             : 
     969       82330 :     if (!Subtarget->isAmdHsaOS() &&
     970             :         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
     971             :       // The ABI says the caller will extend these values to 32-bits.
     972        1860 :       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
     973       40235 :     } else if (NumRegs == 1) {
     974             :       // This argument is not split, so the IR type is the memory type.
     975             :       assert(!In.Flags.isSplit());
     976       37482 :       if (In.ArgVT.isExtended()) {
     977             :         // We have an extended type, like i24, so we should just use the register type
     978         108 :         MemVT = In.VT;
     979             :       } else {
     980       37374 :         MemVT = In.ArgVT;
     981             :       }
     982        7671 :     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
     983        4213 :                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
     984             :       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
     985             :       // We have a vector value which has been split into a vector with
     986             :       // the same scalar type, but fewer elements.  This should handle
     987             :       // all the floating-point vector types.
     988         730 :       MemVT = In.VT;
     989        3458 :     } else if (In.ArgVT.isVector() &&
     990        1435 :                In.ArgVT.getVectorNumElements() == NumRegs) {
     991             :       // This arg has been split so that each element is stored in a separate
     992             :       // register.
     993        1425 :       MemVT = In.ArgVT.getScalarType();
     994         598 :     } else if (In.ArgVT.isExtended()) {
     995             :       // We have an extended type, like i65.
     996          26 :       MemVT = In.VT;
     997             :     } else {
     998         572 :       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
     999             :       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
    1000         572 :       if (In.VT.isInteger()) {
    1001         572 :         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
    1002           0 :       } else if (In.VT.isVector()) {
    1003             :         assert(!In.VT.getScalarType().isFloatingPoint());
    1004           0 :         unsigned NumElements = In.VT.getVectorNumElements();
    1005             :         assert(MemoryBits % NumElements == 0);
    1006             :         // This vector type has been split into another vector type with
    1007             :         // a different elements size.
    1008             :         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
    1009           0 :                                          MemoryBits / NumElements);
    1010           0 :         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
    1011             :       } else {
    1012           0 :         llvm_unreachable("cannot deduce memory type.");
    1013             :       }
    1014             :     }
    1015             : 
    1016             :     // Convert one element vectors to scalar.
    1017       41165 :     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
    1018          50 :       MemVT = MemVT.getScalarType();
    1019             : 
    1020       41165 :     if (MemVT.isExtended()) {
    1021             :       // This should really only happen if we have vec3 arguments
    1022             :       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
    1023           0 :       MemVT = MemVT.getPow2VectorType(State.getContext());
    1024             :     }
    1025             : 
    1026             :     assert(MemVT.isSimple());
    1027       41165 :     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
    1028             :                     State);
    1029             :   }
    1030       17546 : }
    1031             : 
                       /// Lowers a return by emitting an AMDGPUISD::ENDPGM node of type MVT::Other
                       /// whose only operand is \p Chain.
                       ///
                       /// \p CallConv, \p isVarArg, \p Outs and \p OutVals are not consulted here;
                       /// the assertion that there are no return values is commented out (see the
                       /// FIXME in the body).
                       ///
                       /// \returns the newly created ENDPGM node.
    1032       17566 : SDValue AMDGPUTargetLowering::LowerReturn(
    1033             :   SDValue Chain, CallingConv::ID CallConv,
    1034             :   bool isVarArg,
    1035             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1036             :   const SmallVectorImpl<SDValue> &OutVals,
    1037             :   const SDLoc &DL, SelectionDAG &DAG) const {
    1038             :   // FIXME: Fails for r600 tests
    1039             :   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
    1040             :   // "wave terminate should not have return values");
    1041       17566 :   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
    1042             : }
    1043             : 
    1044             : //===---------------------------------------------------------------------===//
    1045             : // Target specific lowering
    1046             : //===---------------------------------------------------------------------===//
    1047             : 
    1048             : /// Selects the correct CCAssignFn for a given CallingConvention value.
                       ///
                       /// Forwards \p CC and \p IsVarArg unchanged to
                       /// AMDGPUCallLowering::CCAssignFnForCall and returns its result.
    1049        3117 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
    1050             :                                                     bool IsVarArg) {
    1051        3117 :   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
    1052             : }
    1053             : 
                       /// Selects the CCAssignFn used for return values of calling convention
                       /// \p CC.
                       ///
                       /// Forwards \p CC and \p IsVarArg unchanged to
                       /// AMDGPUCallLowering::CCAssignFnForReturn and returns its result.
    1054        4781 : CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
    1055             :                                                       bool IsVarArg) {
    1056        4781 :   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
    1057             : }
    1058             : 
                       /// Builds a TokenFactor that orders \p Chain after every load of a stack
                       /// argument whose bytes overlap the frame object \p ClobberedFI.
                       ///
                       /// The loads considered are the users of the DAG entry node that are
                       /// LoadSDNodes whose base pointer is a FrameIndexSDNode with a negative
                       /// index. A load is included when the closed byte range of its frame
                       /// object (offset .. offset + size - 1, taken from \p MFI) intersects the
                       /// closed byte range of \p ClobberedFI.
                       ///
                       /// \returns a TokenFactor over \p Chain followed by the chain result
                       /// (value #1) of each overlapping load.
    1059          27 : SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
    1060             :                                                   SelectionDAG &DAG,
    1061             :                                                   MachineFrameInfo &MFI,
    1062             :                                                   int ClobberedFI) const {
    1063             :   SmallVector<SDValue, 8> ArgChains;
                         // Inclusive byte range occupied by the object that is about to be
                         // clobbered.
    1064             :   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
    1065          27 :   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
    1066             : 
    1067             :   // Include the original chain at the beginning of the list. When this is
    1068             :   // used by target LowerCall hooks, this helps legalization find the
    1069             :   // CALLSEQ_BEGIN node.
    1070          27 :   ArgChains.push_back(Chain);
    1071             : 
    1072             :   // Add a chain value for each stack argument load overlapping ClobberedFI.
    1073          27 :   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
    1074             :                             UE = DAG.getEntryNode().getNode()->use_end();
    1075         918 :        U != UE; ++U) {
    1076             :     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
    1077             :       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
                             // Only negative frame indices are examined (presumably the fixed
                             // objects that hold incoming stack arguments -- TODO confirm).
    1078          72 :         if (FI->getIndex() < 0) {
    1079             :           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
    1080             :           int64_t InLastByte = InFirstByte;
    1081          72 :           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
    1082             : 
                               // Two closed intervals overlap when either one contains the first
                               // byte of the other.
    1083         132 :           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
    1084          60 :               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
    1085          12 :             ArgChains.push_back(SDValue(L, 1));
    1086             :         }
    1087             :       }
    1088             :     }
    1089             :   }
    1090             : 
    1091             :   // Build a tokenfactor for all the chains.
    1092          81 :   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
    1093             : }
    1094             : 
                       /// Reports a call that cannot be lowered and produces placeholder results.
                       ///
                       /// Emits a DiagnosticInfoUnsupported whose message is \p Reason followed by
                       /// the callee name: the symbol of an ExternalSymbolSDNode, the name of the
                       /// global of a GlobalAddressSDNode, or "<unknown>" for any other callee.
                       ///
                       /// \param CLI     call being lowered; supplies the callee, DAG, debug
                       ///                location, tail-call flag and expected results (Ins).
                       /// \param InVals  for non-tail calls, receives one UNDEF value per entry of
                       ///                CLI.Ins, using that entry's VT.
                       /// \param Reason  prefix of the diagnostic message.
                       /// \returns the DAG entry node.
    1095          83 : SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
    1096             :                                                  SmallVectorImpl<SDValue> &InVals,
    1097             :                                                  StringRef Reason) const {
    1098          83 :   SDValue Callee = CLI.Callee;
    1099          83 :   SelectionDAG &DAG = CLI.DAG;
    1100             : 
    1101          83 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1102             : 
                         // Fallback name used when the callee is neither an external symbol nor a
                         // global address.
    1103             :   StringRef FuncName("<unknown>");
    1104             : 
    1105             :   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    1106         128 :     FuncName = G->getSymbol();
    1107             :   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    1108          17 :     FuncName = G->getGlobal()->getName();
    1109             : 
    1110             :   DiagnosticInfoUnsupported NoCalls(
    1111         166 :     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
    1112          83 :   DAG.getContext()->diagnose(NoCalls);
    1113             : 
                         // Give every expected return value an UNDEF placeholder; tail calls get
                         // none.
    1114          82 :   if (!CLI.IsTailCall) {
    1115         151 :     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
    1116         146 :       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
    1117             :   }
    1118             : 
    1119          82 :   return DAG.getEntryNode();
    1120             : }
    1121             : 
                       /// Lowers a call by treating it as unsupported: delegates to
                       /// lowerUnhandledCall with the reason "unsupported call to function " and
                       /// returns whatever that returns.
    1122          77 : SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
    1123             :                                         SmallVectorImpl<SDValue> &InVals) const {
    1124          77 :   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
    1125             : }
    1126             : 
                       /// Rejects a dynamic alloca: emits an "unsupported dynamic alloca"
                       /// diagnostic for the current function and substitutes placeholder results.
                       ///
                       /// \returns merged values made of a zero constant of \p Op's value type and
                       /// operand 0 of \p Op (presumably the incoming chain -- TODO confirm).
    1127           3 : SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    1128             :                                                       SelectionDAG &DAG) const {
    1129           3 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1130             : 
    1131             :   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
    1132           9 :                                             SDLoc(Op).getDebugLoc());
    1133           3 :   DAG.getContext()->diagnose(NoDynamicAlloca);
                         // Replace the node's results with a zero of the result type and operand 0.
    1134           9 :   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
    1135           9 :   return DAG.getMergeValues(Ops, SDLoc());
    1136             : }
    1137             : 
                       /// Dispatches custom lowering of \p Op to the matching Lower* helper based
                       /// on its opcode.
                       ///
                       /// Any opcode without a case prints the node to errs() and hits
                       /// llvm_unreachable. FLOG and FLOG10 share LowerFLOG and differ only in the
                       /// constant passed as its third argument.
                       ///
                       /// \returns the value produced by the selected helper.
    1138       22234 : SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    1139             :                                              SelectionDAG &DAG) const {
    1140       22234 :   switch (Op.getOpcode()) {
    1141           0 :   default:
    1142           0 :     Op->print(errs(), &DAG);
                           // NOTE(review): the two adjacent string literals below concatenate
                           // without a separating space, so the message reads
                           // "...code for thisinstruction is not...".
    1143           0 :     llvm_unreachable("Custom lowering code for this"
    1144             :                      "instruction is not implemented yet!");
    1145             :     break;
    1146          16 :   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
    1147        3282 :   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    1148       16685 :   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
    1149         326 :   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
    1150         158 :   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
    1151          36 :   case ISD::FREM: return LowerFREM(Op, DAG);
    1152          31 :   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
    1153          75 :   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
    1154          14 :   case ISD::FRINT: return LowerFRINT(Op, DAG);
    1155          45 :   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
    1156          90 :   case ISD::FROUND: return LowerFROUND(Op, DAG);
    1157           0 :   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
                         // 1 / log2(e), i.e. ln(2), is the constant handed to LowerFLOG.
    1158          37 :   case ISD::FLOG:
    1159          37 :     return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
                         // ln(2) / ln(10), i.e. log10(2), is the constant handed to LowerFLOG.
    1160          37 :   case ISD::FLOG10:
    1161          37 :     return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
    1162          38 :   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
    1163          47 :   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
    1164         797 :   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
    1165          65 :   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
    1166          37 :   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
                         // All four count-leading/trailing-zero opcodes share one helper.
    1167         415 :   case ISD::CTTZ:
    1168             :   case ISD::CTTZ_ZERO_UNDEF:
    1169             :   case ISD::CTLZ:
    1170             :   case ISD::CTLZ_ZERO_UNDEF:
    1171         415 :     return LowerCTLZ_CTTZ(Op, DAG);
    1172           3 :   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    1173             :   }
    1174             :   return Op;
    1175             : }
    1176             : 
                       /// Hook for replacing the results of node \p N.
                       ///
                       /// This implementation never appends to \p Results: both the
                       /// SIGN_EXTEND_INREG case and the default case return immediately. The
                       /// explicit SIGN_EXTEND_INREG case exists to carry the explanation below.
    1177          53 : void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
    1178             :                                               SmallVectorImpl<SDValue> &Results,
    1179             :                                               SelectionDAG &DAG) const {
    1180             :   switch (N->getOpcode()) {
    1181             :   case ISD::SIGN_EXTEND_INREG:
    1182             :     // Different parts of legalization seem to interpret which type of
    1183             :     // sign_extend_inreg is the one to check for custom lowering. The extended
    1184             :     // from type is what really matters, but some places check for custom
    1185             :     // lowering of the result type. This results in trying to use
    1186             :     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    1187             :     // nothing here and let the illegal result integer be handled normally.
    1188             :     return;
    1189             :   default:
    1190             :     return;
    1191             :   }
    1192             : }
    1193             : 
                       /// Returns true only when \p GV is a GlobalVariable that has an initializer
                       /// and that initializer is not an UndefValue. Non-variable globals and
                       /// variables without an initializer yield false.
    1194             : static bool hasDefinedInitializer(const GlobalValue *GV) {
    1195             :   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
    1196         402 :   if (!GVar || !GVar->hasInitializer())
    1197             :     return false;
    1198             : 
    1199         396 :   return !isa<UndefValue>(GVar->getInitializer());
    1200             : }
    1201             : 
                       /// Lowers a GlobalAddress node.
                       ///
                       /// For globals in the LOCAL or REGION address space:
                       ///  - a diagnostic is emitted if the current function is not an entry
                       ///    function;
                       ///  - the node's offset is asserted to be zero;
                       ///  - if the global has no defined initializer, it is allocated through
                       ///    \p MFI (allocateLDSGlobal) and the resulting offset is returned as a
                       ///    constant of \p Op's value type.
                       ///
                       /// In every other case (a different address space, or a LOCAL/REGION global
                       /// with a defined initializer) an "unsupported initializer for address
                       /// space" diagnostic is emitted and an empty SDValue is returned.
    1202         402 : SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
    1203             :                                                  SDValue Op,
    1204             :                                                  SelectionDAG &DAG) const {
    1205             : 
    1206         402 :   const DataLayout &DL = DAG.getDataLayout();
    1207             :   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    1208         402 :   const GlobalValue *GV = G->getGlobal();
    1209             : 
    1210         402 :   if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
    1211           0 :       G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
                           // Diagnose, but keep lowering, when the use is outside an entry function.
    1212         402 :     if (!MFI->isEntryFunction()) {
    1213           1 :       const Function &Fn = DAG.getMachineFunction().getFunction();
    1214             :       DiagnosticInfoUnsupported BadLDSDecl(
    1215           3 :         Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
    1216           1 :       DAG.getContext()->diagnose(BadLDSDecl);
    1217             :     }
    1218             : 
    1219             :     // XXX: What does the value of G->getOffset() mean?
    1220             :     assert(G->getOffset() == 0 &&
    1221             :          "Do not know what to do with an non-zero offset");
    1222             : 
    1223             :     // TODO: We could emit code to handle the initialization somewhere.
    1224         396 :     if (!hasDefinedInitializer(GV)) {
    1225         394 :       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
    1226         788 :       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    1227             :     }
    1228             :   }
    1229             : 
                         // Reached for other address spaces and for LOCAL/REGION globals that
                         // carry a defined initializer.
    1230           8 :   const Function &Fn = DAG.getMachineFunction().getFunction();
    1231             :   DiagnosticInfoUnsupported BadInit(
    1232          24 :       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
    1233           8 :   DAG.getContext()->diagnose(BadInit);
    1234           8 :   return SDValue();
    1235             : }
    1236             : 
                       /// Lowers CONCAT_VECTORS into a BUILD_VECTOR.
                       ///
                       /// For a v4i16 or v4f16 result, operands 0 and 1 are each bitcast to i32,
                       /// combined into a v2i32 build vector, and bitcast back to the result type.
                       /// For any other type, the elements of every operand are extracted in
                       /// operand order and a build vector of the result type is formed from them.
    1237        3282 : SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
    1238             :                                                   SelectionDAG &DAG) const {
    1239             :   SmallVector<SDValue, 8> Args;
    1240             : 
    1241        3282 :   EVT VT = Op.getValueType();
                         // Four 16-bit lanes: treat each operand as a single i32 instead of
                         // extracting individual elements.
    1242             :   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    1243             :     SDLoc SL(Op);
    1244          71 :     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    1245          71 :     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
    1246             : 
    1247         142 :     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    1248          71 :     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    1249             :   }
    1250             : 
                         // General case: collect the elements of each operand into Args.
    1251       16063 :   for (const SDUse &U : Op->ops())
    1252        6426 :     DAG.ExtractVectorElements(U.get(), Args);
    1253             : 
    1254        6422 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1255             : }
    1256             : 
                       /// Lowers EXTRACT_SUBVECTOR into a BUILD_VECTOR.
                       ///
                       /// Operand 1 must be a ConstantSDNode (enforced by cast<>) giving the first
                       /// element index. As many elements as the result type holds are extracted
                       /// from operand 0 starting at that index and rebuilt as a vector of the
                       /// result type.
    1257       16685 : SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
    1258             :                                                      SelectionDAG &DAG) const {
    1259             : 
    1260             :   SmallVector<SDValue, 8> Args;
    1261       33370 :   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    1262       16685 :   EVT VT = Op.getValueType();
    1263       33370 :   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
    1264             :                             VT.getVectorNumElements());
    1265             : 
    1266       50055 :   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
    1267             : }
    1268             : 
    1269             : /// Generate Min/Max node
                       ///
                       /// Tries to express "compare \p LHS with \p RHS under \p CC, then pick
                       /// \p True or \p False" as a single AMDGPUISD::FMIN_LEGACY or FMAX_LEGACY
                       /// node (presumably the operands of a select on a setcc -- TODO confirm
                       /// against callers).
                       ///
                       /// The combine only applies when {\p LHS, \p RHS} and {\p True, \p False}
                       /// are the same pair of values, in either order. The node kind and its
                       /// operand order depend on the condition code and on which compare operand
                       /// is \p True.
                       ///
                       /// \returns the new node, or an empty SDValue when
                       ///  - the operands do not match as described above,
                       ///  - \p CC is an equality, ordered/unordered-only, or constant condition,
                       ///  - \p CC is an ordered or unspecified less/greater compare and the
                       ///    combine level is before AfterLegalizeDAG without being called by the
                       ///    legalizer.
    1270         741 : SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
    1271             :                                                    SDValue LHS, SDValue RHS,
    1272             :                                                    SDValue True, SDValue False,
    1273             :                                                    SDValue CC,
    1274             :                                                    DAGCombinerInfo &DCI) const {
    1275             :   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    1276         646 :     return SDValue();
    1277             : 
    1278          95 :   SelectionDAG &DAG = DCI.DAG;
    1279          95 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    1280          95 :   switch (CCOpcode) {
                         // Conditions that do not describe a min/max: fall out of the switch and
                         // return an empty SDValue.
    1281             :   case ISD::SETOEQ:
    1282             :   case ISD::SETONE:
    1283             :   case ISD::SETUNE:
    1284             :   case ISD::SETNE:
    1285             :   case ISD::SETUEQ:
    1286             :   case ISD::SETEQ:
    1287             :   case ISD::SETFALSE:
    1288             :   case ISD::SETFALSE2:
    1289             :   case ISD::SETTRUE:
    1290             :   case ISD::SETTRUE2:
    1291             :   case ISD::SETUO:
    1292             :   case ISD::SETO:
    1293             :     break;
                         // Unordered less-than / less-or-equal.
    1294             :   case ISD::SETULE:
    1295             :   case ISD::SETULT: {
    1296             :     if (LHS == True)
    1297          14 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1298           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1299             :   }
    1300           4 :   case ISD::SETOLE:
    1301             :   case ISD::SETOLT:
    1302             :   case ISD::SETLE:
    1303             :   case ISD::SETLT: {
    1304             :     // Ordered. Assume ordered for undefined.
    1305             : 
    1306             :     // Only do this after legalization to avoid interfering with other combines
    1307             :     // which might occur.
    1308           6 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1309           2 :         !DCI.isCalledByLegalizer())
    1310           2 :       return SDValue();
    1311             : 
    1312             :     // We need to permute the operands to get the correct NaN behavior. The
    1313             :     // selected operand is the second one based on the failing compare with NaN,
    1314             :     // so permute it based on the compare type the hardware uses.
    1315             :     if (LHS == True)
    1316           2 :       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1317           0 :     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1318             :   }
                         // Unordered greater-than / greater-or-equal.
    1319             :   case ISD::SETUGE:
    1320             :   case ISD::SETUGT: {
    1321             :     if (LHS == True)
    1322          10 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    1323           1 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    1324             :   }
                         // Ordered (or unspecified) greater-than / greater-or-equal; gated on the
                         // combine level exactly like the less-than group above.
    1325          39 :   case ISD::SETGT:
    1326             :   case ISD::SETGE:
    1327             :   case ISD::SETOGE:
    1328             :   case ISD::SETOGT: {
    1329          72 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
    1330          33 :         !DCI.isCalledByLegalizer())
    1331          14 :       return SDValue();
    1332             : 
    1333             :     if (LHS == True)
    1334          16 :       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    1335           9 :     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    1336             :   }
    1337           0 :   case ISD::SETCC_INVALID:
    1338           0 :     llvm_unreachable("Invalid setcc condcode!");
    1339             :   }
    1340          27 :   return SDValue();
    1341             : }
    1342             : 
                       /// Splits \p Op into two i32 values by bitcasting it to v2i32 and extracting
                       /// both elements.
                       ///
                       /// \returns the pair (element 0, element 1), named Lo and Hi in the body.
    1343             : std::pair<SDValue, SDValue>
    1344        2512 : AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
    1345             :   SDLoc SL(Op);
    1346             : 
    1347        2512 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1348             : 
                         // Element indices used for the two extracts.
    1349        2512 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1350        2512 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1351             : 
    1352        2512 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1353        2512 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1354             : 
    1355        2512 :   return std::make_pair(Lo, Hi);
    1356             : }
    1357             : 
                       /// Returns element 0 (as i32) of \p Op after bitcasting it to v2i32; this
                       /// is the same value split64BitValue returns as its first member.
    1358           0 : SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
    1359             :   SDLoc SL(Op);
    1360             : 
    1361           0 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1362           0 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    1363           0 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    1364             : }
    1365             : 
                       /// Returns element 1 (as i32) of \p Op after bitcasting it to v2i32; this
                       /// is the same value split64BitValue returns as its second member.
    1366         132 : SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
    1367             :   SDLoc SL(Op);
    1368             : 
    1369         132 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
    1370         132 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    1371         264 :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    1372             : }
    1373             : 
                       /// Splits a vector load into two loads of half the vector each.
                       ///
                       /// \p Op must be a LoadSDNode (enforced by cast<>). Two-element vectors are
                       /// handed to scalarizeVectorLoad instead. Otherwise the value type and the
                       /// memory type are both split into low/high parts, the low part is loaded
                       /// from the original base pointer and the high part from the base pointer
                       /// advanced by the store size of the low memory type. Both loads keep the
                       /// original extension type, chain and memory-operand flags.
                       ///
                       /// \returns merged values: the CONCAT_VECTORS of the two loads, and a
                       /// TokenFactor of their output chains.
    1374        2367 : SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
    1375             :                                               SelectionDAG &DAG) const {
    1376             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    1377        4734 :   EVT VT = Op.getValueType();
    1378             : 
    1379             : 
    1380             :   // If this is a 2 element vector, we really want to scalarize and not create
    1381             :   // weird 1 element vectors.
    1382        2367 :   if (VT.getVectorNumElements() == 2)
    1383           0 :     return scalarizeVectorLoad(Load, DAG);
    1384             : 
    1385        2367 :   SDValue BasePtr = Load->getBasePtr();
    1386        2367 :   EVT MemVT = Load->getMemoryVT();
    1387             :   SDLoc SL(Op);
    1388             : 
    1389        2367 :   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
    1390             : 
    1391        2367 :   EVT LoVT, HiVT;
    1392        2367 :   EVT LoMemVT, HiMemVT;
    1393             :   SDValue Lo, Hi;
    1394             : 
    1395        4734 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1396        4734 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
                         // NOTE(review): Lo and Hi are assigned here but never read again in this
                         // function.
    1397        4734 :   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
    1398             : 
                         // The high half starts Size bytes past BasePtr, so its alignment is
                         // derived from both the base alignment and that offset.
    1399             :   unsigned Size = LoMemVT.getStoreSize();
    1400        2367 :   unsigned BaseAlign = Load->getAlignment();
    1401        4734 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1402             : 
    1403             :   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
    1404             :                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
    1405        4734 :                                   BaseAlign, Load->getMemOperand()->getFlags());
    1406        2367 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
    1407             :   SDValue HiLoad =
    1408             :       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
    1409             :                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
    1410        7101 :                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
    1411             : 
                         // Result 0: the recombined vector. Result 1: a chain joining both loads.
    1412             :   SDValue Ops[] = {
    1413        2367 :     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    1414             :     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    1415        4734 :                 LoLoad.getValue(1), HiLoad.getValue(1))
    1416        4734 :   };
    1417             : 
    1418        2367 :   return DAG.getMergeValues(Ops, SL);
    1419             : }
    1420             : 
                       /// Splits a vector store into two stores of half the vector each.
                       ///
                       /// \p Op must be a StoreSDNode (enforced by cast<>). Two-element vectors are
                       /// handed to scalarizeVectorStore instead. Otherwise the stored value and
                       /// the memory type are split into low/high parts; the low part is stored at
                       /// the original base pointer and the high part at the base pointer advanced
                       /// by the store size of the low memory type. Both stores use the original
                       /// chain and memory-operand flags.
                       ///
                       /// \returns a TokenFactor of the two truncating stores.
    1421        7586 : SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
    1422             :                                                SelectionDAG &DAG) const {
    1423             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    1424        7586 :   SDValue Val = Store->getValue();
    1425       15172 :   EVT VT = Val.getValueType();
    1426             : 
    1427             :   // If this is a 2 element vector, we really want to scalarize and not create
    1428             :   // weird 1 element vectors.
    1429        7586 :   if (VT.getVectorNumElements() == 2)
    1430           0 :     return scalarizeVectorStore(Store, DAG);
    1431             : 
    1432        7586 :   EVT MemVT = Store->getMemoryVT();
    1433        7586 :   SDValue Chain = Store->getChain();
    1434        7586 :   SDValue BasePtr = Store->getBasePtr();
    1435             :   SDLoc SL(Op);
    1436             : 
    1437        7586 :   EVT LoVT, HiVT;
    1438        7586 :   EVT LoMemVT, HiMemVT;
    1439             :   SDValue Lo, Hi;
    1440             : 
    1441       15172 :   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
    1442       15172 :   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
    1443       15172 :   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
    1444             : 
                         // Address of the high half: just past the bytes written for the low half.
    1445        7586 :   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
    1446             : 
    1447        7586 :   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
    1448             :   unsigned BaseAlign = Store->getAlignment();
    1449             :   unsigned Size = LoMemVT.getStoreSize();
                         // The high half's alignment is derived from the base alignment and its
                         // byte offset from the base.
    1450       15172 :   unsigned HiAlign = MinAlign(BaseAlign, Size);
    1451             : 
    1452             :   SDValue LoStore =
    1453             :       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
    1454        7586 :                         Store->getMemOperand()->getFlags());
    1455             :   SDValue HiStore =
    1456             :       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
    1457        7586 :                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
    1458             : 
    1459        7586 :   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
    1460             : }
    1461             : 
    1462             : // This is a shortcut for integer division because we have fast i32<->f32
    1463             : // conversions, and fast f32 reciprocal instructions. The fractional part of a
    1464             : // float is enough to accurately represent up to a 24-bit signed integer.
                       //
                       // Op's operands 0 and 1 are the dividend and divisor; Sign selects signed
                       // versus unsigned conversions and result truncation. The shortcut is only
                       // taken when ComputeNumSignBits reports at least 9 sign bits for each
                       // operand; otherwise an empty SDValue is returned.
                       //
                       // On success the result is a pair of merged values { Div, Rem }.
    1465         410 : SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
    1466             :                                             bool Sign) const {
    1467             :   SDLoc DL(Op);
    1468         410 :   EVT VT = Op.getValueType();
    1469         410 :   SDValue LHS = Op.getOperand(0);
    1470         410 :   SDValue RHS = Op.getOperand(1);
    1471             :   MVT IntVT = MVT::i32;
    1472             :   MVT FltVT = MVT::f32;
    1473             : 
                         // Bail out unless both operands have at least 9 known sign bits.
    1474         410 :   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
    1475         410 :   if (LHSSignBits < 9)
    1476         308 :     return SDValue();
    1477             : 
    1478         102 :   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
    1479         102 :   if (RHSSignBits < 9)
    1480          13 :     return SDValue();
    1481             : 
                         // DivBits: bits left after removing the sign bits common to both
                         // operands, plus one bit for the sign in the signed case.
    1482          89 :   unsigned BitSize = VT.getSizeInBits();
    1483          89 :   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    1484          89 :   unsigned DivBits = BitSize - SignBits;
    1485          89 :   if (Sign)
    1486          48 :     ++DivBits;
    1487             : 
    1488          89 :   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
    1489          89 :   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
    1490             : 
                         // jq is the correction added to the truncated quotient; it stays 1 for
                         // the unsigned case.
    1491          89 :   SDValue jq = DAG.getConstant(1, DL, IntVT);
    1492             : 
    1493          89 :   if (Sign) {
    1494             :     // char|short jq = ia ^ ib;
    1495          48 :     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
    1496             : 
    1497             :     // jq = jq >> (bitsize - 2)
    1498          48 :     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
    1499          96 :                      DAG.getConstant(BitSize - 2, DL, VT));
    1500             : 
    1501             :     // jq = jq | 0x1
                           // (the top bits of ia ^ ib are all copies of its sign because both
                           // operands have at least 9 sign bits, so jq is now -1 when the operand
                           // signs differ and +1 otherwise)
    1502          48 :     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
    1503             :   }
    1504             : 
    1505             :   // int ia = (int)LHS;
    1506          89 :   SDValue ia = LHS;
    1507             : 
    1508             :   // int ib = (int)RHS;
    1509          89 :   SDValue ib = RHS;
    1510             : 
    1511             :   // float fa = (float)ia;
    1512          89 :   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
    1513             : 
    1514             :   // float fb = (float)ib;
    1515          89 :   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
    1516             : 
                         // float fq = fa * rcp(fb);
    1517             :   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
    1518         178 :                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
    1519             : 
    1520             :   // fq = trunc(fq);
    1521          89 :   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
    1522             : 
    1523             :   // float fqneg = -fq;
    1524          89 :   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
    1525             : 
    1526             :   // float fr = mad(fqneg, fb, fa);
                         // The multiply-add opcode depends on whether the subtarget reports FP32
                         // denormal support.
    1527          89 :   unsigned OpCode = Subtarget->hasFP32Denormals() ?
    1528             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1529             :                     (unsigned)ISD::FMAD;
    1530          89 :   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
    1531             : 
    1532             :   // int iq = (int)fq;
    1533          89 :   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
    1534             : 
    1535             :   // fr = fabs(fr);
    1536          89 :   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
    1537             : 
    1538             :   // fb = fabs(fb);
    1539          89 :   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
    1540             : 
    1541         178 :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    1542             : 
    1543             :   // int cv = fr >= fb;
    1544          89 :   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
    1545             : 
    1546             :   // jq = (cv ? jq : 0);
    1547          89 :   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
    1548             : 
    1549             :   // dst = iq + jq;
    1550          89 :   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
    1551             : 
    1552             :   // Rem needs compensation, it's easier to recompute it
                         // as LHS - Div * RHS.
    1553          89 :   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
    1554          89 :   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
    1555             : 
    1556             :   // Truncate to number of bits this divide really is.
                         // Signed: sign-extend in register from a DivBits-wide integer.
                         // Unsigned: mask off everything above the low DivBits bits.
    1557          89 :   if (Sign) {
    1558             :     SDValue InRegSize
    1559          48 :       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    1560          48 :     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    1561          48 :     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
    1562             :   } else {
    1563          41 :     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    1564          41 :     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    1565          41 :     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
    1566             :   }
    1567             : 
    1568         178 :   return DAG.getMergeValues({ Div, Rem }, DL);
    1569             : }
    1570             : 
    1571          72 : void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
    1572             :                                       SelectionDAG &DAG,
    1573             :                                       SmallVectorImpl<SDValue> &Results) const {
    1574             :   SDLoc DL(Op);
    1575          72 :   EVT VT = Op.getValueType();
    1576             :   // Expands an i64 UDIVREM; pushes the quotient, then the remainder, to Results.
    1577             :   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
    1578             : 
    1579          72 :   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1580             : 
    1581          72 :   SDValue One = DAG.getConstant(1, DL, HalfVT);
    1582          72 :   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
    1583             : 
    1584             :   // Split both operands into their low (element 0) and high (element 1) halves.
    1585          72 :   SDValue LHS = Op.getOperand(0);
    1586          72 :   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1587          72 :   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
    1588             : 
    1589          72 :   SDValue RHS = Op.getOperand(1);
    1590          72 :   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1591          72 :   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
    1592             :   // Fast path: the high 32 bits of both operands are known to be zero.
    1593         160 :   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
    1594         144 :       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
    1595             :     // Divide the low halves only.
    1596             :     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1597          16 :                               LHS_Lo, RHS_Lo);
    1598             :     // Rebuild the i64 results with a zero high half.
    1599          32 :     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    1600          32 :     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
    1601             : 
    1602          32 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    1603          16 :     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    1604             :     return;
    1605             :   }
    1606             :   // When i64 is a legal type, use the reciprocal-based expansion below.
    1607          56 :   if (isTypeLegal(MVT::i64)) {
    1608             :     // Compute denominator reciprocal, starting from an f32 RCP of RHS.
    1609          38 :     unsigned FMAD = Subtarget->hasFP32Denormals() ?
    1610             :                     (unsigned)AMDGPUISD::FMAD_FTZ :
    1611             :                     (unsigned)ISD::FMAD;
    1612             :     // 0x4f800000=2^32, 0x5f7ffffc~=2^64, 0x2f800000=2^-32, 0xcf800000=-2^32 (f32).
    1613          38 :     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    1614          38 :     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    1615             :     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
    1616          38 :       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
    1617         114 :       Cvt_Lo);
    1618          38 :     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    1619             :     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
    1620         114 :       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    1621             :     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
    1622         114 :       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    1623          38 :     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    1624             :     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
    1625          38 :       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
    1626         114 :       Mul1);
    1627          38 :     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    1628          38 :     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    1629             :     SDValue Rcp64 = DAG.getBitcast(VT,
    1630          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
    1631             : 
    1632          38 :     SDValue Zero64 = DAG.getConstant(0, DL, VT);
    1633          38 :     SDValue One64  = DAG.getConstant(1, DL, VT);
    1634          38 :     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    1635          38 :     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
    1636             :     // First step: Mulhi1 = mulhu(Rcp64, -RHS * Rcp64).
    1637          38 :     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    1638          38 :     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    1639          38 :     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    1640             :     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1641          38 :                                     Zero);
    1642             :     SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
    1643          38 :                                     One);
    1644             :     // Add1 = Rcp64 + Mulhi1, added per half with an explicit carry.
    1645             :     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
    1646          38 :                                   Mulhi1_Lo, Zero1);
    1647             :     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
    1648          38 :                                   Mulhi1_Hi, Add1_Lo.getValue(1));
    1649          38 :     SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    1650             :     SDValue Add1 = DAG.getBitcast(VT,
    1651          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
    1652             :     // Second step: Mulhi2 = mulhu(Add1, -RHS * Add1).
    1653          38 :     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    1654          38 :     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    1655             :     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1656          38 :                                     Zero);
    1657             :     SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
    1658          38 :                                     One);
    1659             :     // Add2 = Add1 + Mulhi2; the high half restarts from Add1_HiNc plus Add1_Lo's carry.
    1660             :     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
    1661          38 :                                   Mulhi2_Lo, Zero1);
    1662             :     SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
    1663          38 :                                    Mulhi2_Hi, Add1_Lo.getValue(1));
    1664             :     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
    1665          38 :                                   Zero, Add2_Lo.getValue(1));
    1666             :     SDValue Add2 = DAG.getBitcast(VT,
    1667          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    1668          38 :     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
    1669             :     // Mulhi3 is the first quotient candidate; Mul3 = RHS * Mulhi3.
    1670          38 :     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
    1671             :     // Sub1 = LHS - Mul3 is the matching remainder candidate.
    1672          38 :     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    1673          38 :     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    1674             :     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
    1675          38 :                                   Mul3_Lo, Zero1);
    1676             :     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
    1677          38 :                                   Mul3_Hi, Sub1_Lo.getValue(1));
    1678          38 :     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    1679             :     SDValue Sub1 = DAG.getBitcast(VT,
    1680          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
    1681             :     // C3 is all-ones when Sub1 >= RHS (unsigned), compared half by half.
    1682          38 :     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    1683             :     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
    1684          38 :                                  ISD::SETUGE);
    1685             :     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
    1686          38 :                                  ISD::SETUGE);
    1687          38 :     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
    1688             : 
    1689             :     // TODO: Here and below portions of the code can be enclosed into if/endif.
    1690             :     // Currently control flow is unconditional and we have 4 selects after
    1691             :     // potential endif to substitute PHIs.
    1692             : 
    1693             :     // if C3 != 0 ...
    1694             :     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
    1695          38 :                                   RHS_Lo, Zero1);
    1696             :     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
    1697          38 :                                   RHS_Hi, Sub1_Lo.getValue(1));
    1698             :     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1699          38 :                                   Zero, Sub2_Lo.getValue(1));
    1700             :     SDValue Sub2 = DAG.getBitcast(VT,
    1701          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
    1702             :     // Sub2 = Sub1 - RHS pairs with the quotient candidate Add3 = Mulhi3 + 1.
    1703          38 :     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
    1704             :     // C6 is all-ones when Sub2 >= RHS (unsigned).
    1705             :     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
    1706          38 :                                  ISD::SETUGE);
    1707             :     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
    1708          38 :                                  ISD::SETUGE);
    1709          38 :     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
    1710             : 
    1711             :     // if (C6 != 0)
    1712          38 :     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
    1713             :     // Sub3 = Sub2 - RHS pairs with the quotient candidate Add4 = Add3 + 1.
    1714             :     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
    1715          38 :                                   RHS_Lo, Zero1);
    1716             :     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
    1717          38 :                                   RHS_Hi, Sub2_Lo.getValue(1));
    1718             :     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
    1719          38 :                                   Zero, Sub3_Lo.getValue(1));
    1720             :     SDValue Sub3 = DAG.getBitcast(VT,
    1721          76 :                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
    1722             : 
    1723             :     // endif C6
    1724             :     // endif C3
    1725             :     // Pick the quotient/remainder pair that matches the C3 and C6 conditions.
    1726          38 :     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    1727          38 :     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
    1728             : 
    1729          38 :     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    1730          38 :     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
    1731             : 
    1732          38 :     Results.push_back(Div);
    1733          38 :     Results.push_back(Rem);
    1734             : 
    1735             :     return;
    1736             :   }
    1737             : 
    1738             :   // r600 expansion, reached when i64 is not a legal type.
    1739             :   // Get Speculative values
    1740          18 :   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    1741          18 :   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
    1742             :   // The speculative values are only selected when RHS_Hi == 0.
    1743          18 :   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
    1744          36 :   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
    1745          18 :   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
    1746             : 
    1747          18 :   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
    1748          18 :   SDValue DIV_Lo = Zero;
    1749             : 
    1750          18 :   const unsigned halfBitWidth = HalfVT.getSizeInBits();
    1751             :   // Bit-by-bit long division over LHS_Lo, from the most significant bit down.
    1752        1170 :   for (unsigned i = 0; i < halfBitWidth; ++i) {
    1753         576 :     const unsigned bitPos = halfBitWidth - i - 1;
    1754         576 :     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    1755             :     // Get the value of bit bitPos of LHS_Lo
    1756         576 :     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    1757         576 :     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    1758         576 :     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
    1759             : 
    1760             :     // Shift
    1761         576 :     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    1762             :     // Add LHS high bit
    1763         576 :     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
    1764             :     // Set this quotient bit when REM >= RHS (unsigned).
    1765         576 :     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    1766         576 :     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
    1767             : 
    1768         576 :     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
    1769             : 
    1770             :     // Update REM
    1771         576 :     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    1772         576 :     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
    1773             :   }
    1774             : 
    1775          36 :   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
    1776          18 :   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
    1777          18 :   Results.push_back(DIV);
    1778          18 :   Results.push_back(REM);
    1779             : }
    1780             : 
    1781         326 : SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
    1782             :                                            SelectionDAG &DAG) const {
    1783             :   SDLoc DL(Op);
    1784         326 :   EVT VT = Op.getValueType();
    1785             :   // i64 is expanded by LowerUDIVREM64; its two results are merged and returned.
    1786             :   if (VT == MVT::i64) {
    1787             :     SmallVector<SDValue, 2> Results;
    1788          50 :     LowerUDIVREM64(Op, DAG, Results);
    1789          50 :     return DAG.getMergeValues(Results, DL);
    1790             :   }
    1791             :   // For i32, try LowerDIVREM24 first; fall through if it yields no result.
    1792             :   if (VT == MVT::i32) {
    1793         276 :     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
    1794          41 :       return Res;
    1795             :   }
    1796             : 
    1797         235 :   SDValue Num = Op.getOperand(0);
    1798         235 :   SDValue Den = Op.getOperand(1);
    1799             : 
    1800             :   // RCP =  URECIP(Den) = 2^32 / Den + e
    1801             :   // e is rounding error.
    1802         235 :   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
    1803             : 
    1804             :   // RCP_LO = mul(RCP, Den)
    1805         235 :   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
    1806             : 
    1807             :   // RCP_HI = mulhu(RCP, Den)
    1808         235 :   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
    1809             : 
    1810             :   // NEG_RCP_LO = -RCP_LO
    1811             :   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
    1812         235 :                                                      RCP_LO);
    1813             : 
    1814             :   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
    1815             :   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1816             :                                            NEG_RCP_LO, RCP_LO,
    1817         235 :                                            ISD::SETEQ);
    1818             :   // Calculate the rounding error from the URECIP instruction
    1819             :   // E = mulhu(ABS_RCP_LO, RCP)
    1820         235 :   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
    1821             : 
    1822             :   // RCP_A_E = RCP + E
    1823         235 :   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
    1824             : 
    1825             :   // RCP_S_E = RCP - E
    1826         235 :   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
    1827             : 
    1828             :   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
    1829             :   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
    1830             :                                      RCP_A_E, RCP_S_E,
    1831         235 :                                      ISD::SETEQ);
    1832             :   // Quotient = mulhu(Tmp0, Num)
    1833         235 :   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
    1834             : 
    1835             :   // Num_S_Remainder = Quotient * Den
    1836         235 :   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
    1837             : 
    1838             :   // Remainder = Num - Num_S_Remainder
    1839         235 :   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
    1840             : 
    1841             :   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
    1842             :   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
    1843             :                                                  DAG.getConstant(-1, DL, VT),
    1844             :                                                  DAG.getConstant(0, DL, VT),
    1845         235 :                                                  ISD::SETUGE);
    1846             :   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
    1847             :   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
    1848             :                                                   Num_S_Remainder,
    1849             :                                                   DAG.getConstant(-1, DL, VT),
    1850             :                                                   DAG.getConstant(0, DL, VT),
    1851         235 :                                                   ISD::SETUGE);
    1852             :   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
    1853             :   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
    1854         235 :                                                Remainder_GE_Zero);
    1855             : 
    1856             :   // Calculate Division result:
    1857             : 
    1858             :   // Quotient_A_One = Quotient + 1
    1859             :   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
    1860         235 :                                        DAG.getConstant(1, DL, VT));
    1861             : 
    1862             :   // Quotient_S_One = Quotient - 1
    1863             :   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
    1864         235 :                                        DAG.getConstant(1, DL, VT));
    1865             : 
    1866             :   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    1867             :   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1868         235 :                                      Quotient, Quotient_A_One, ISD::SETEQ);
    1869             : 
    1870             :   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    1871         235 :   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1872         235 :                             Quotient_S_One, Div, ISD::SETEQ);
    1873             : 
    1874             :   // Calculate Rem result:
    1875             : 
    1876             :   // Remainder_S_Den = Remainder - Den
    1877         235 :   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
    1878             : 
    1879             :   // Remainder_A_Den = Remainder + Den
    1880         235 :   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
    1881             : 
    1882             :   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    1883             :   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
    1884         235 :                                     Remainder, Remainder_S_Den, ISD::SETEQ);
    1885             : 
    1886             :   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    1887         235 :   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
    1888         235 :                             Remainder_A_Den, Rem, ISD::SETEQ);
    1889             :   SDValue Ops[2] = {
    1890             :     Div,
    1891             :     Rem
    1892         235 :   };
    1893         235 :   return DAG.getMergeValues(Ops, DL);
    1894             : }
    1895             : 
    1896         170 : SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
    1897             :                                            SelectionDAG &DAG) const {
    1898             :   SDLoc DL(Op);
    1899         170 :   EVT VT = Op.getValueType();
    1900             : 
    1901         170 :   SDValue LHS = Op.getOperand(0);
    1902         170 :   SDValue RHS = Op.getOperand(1);
    1903             : 
    1904         170 :   SDValue Zero = DAG.getConstant(0, DL, VT);
    1905         170 :   SDValue NegOne = DAG.getConstant(-1, DL, VT);
    1906             :   // For i32, try LowerDIVREM24 first; fall through if it yields no result.
    1907             :   if (VT == MVT::i32) {
    1908         134 :     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
    1909          48 :       return Res;
    1910             :   }
    1911             :   // i64 with more than 32 sign bits on both operands: divide the low halves.
    1912          36 :   if (VT == MVT::i64 &&
    1913          48 :       DAG.ComputeNumSignBits(LHS) > 32 &&
    1914          12 :       DAG.ComputeNumSignBits(RHS) > 32) {
    1915          12 :     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
    1916             : 
    1917             :     // Only the low halves (element 0) are extracted here.
    1918          12 :     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    1919          12 :     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    1920             :     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
    1921          12 :                                  LHS_Lo, RHS_Lo);
    1922             :     SDValue Res[2] = {
    1923          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
    1924          12 :       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    1925          24 :     };
    1926          12 :     return DAG.getMergeValues(Res, DL);
    1927             :   }
    1928             :   // General case: UDIVREM on the absolute values, then restore the signs.
    1929         110 :   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
    1930         110 :   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
    1931         110 :   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
    1932         110 :   SDValue RSign = LHSign; // Remainder sign is the same as LHS
    1933             :   // abs(x) = (x + sign) ^ sign, where sign is 0 or -1.
    1934         110 :   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
    1935         110 :   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
    1936             : 
    1937         110 :   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
    1938         110 :   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
    1939             : 
    1940         110 :   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
    1941         110 :   SDValue Rem = Div.getValue(1);
    1942             :   // Conditionally negate each result: (x ^ sign) - sign.
    1943         110 :   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
    1944         110 :   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
    1945             : 
    1946         110 :   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
    1947         110 :   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
    1948             : 
    1949             :   SDValue Res[2] = {
    1950             :     Div,
    1951             :     Rem
    1952         110 :   };
    1953         110 :   return DAG.getMergeValues(Res, DL);
    1954             : }
    1955             : 
    1956             : // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
    1957          36 : SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
    1958             :   SDLoc SL(Op);
    1959          36 :   EVT VT = Op.getValueType();
    1960          36 :   SDValue X = Op.getOperand(0);
    1961          36 :   SDValue Y = Op.getOperand(1);
    1962             : 
    1963             :   // TODO: Should this propagate fast-math-flags?
    1964             :   // Note: despite its name, Floor below holds ftrunc(Div), not a floor.
    1965          36 :   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
    1966          36 :   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
    1967          36 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
    1968             : 
    1969          72 :   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
    1970             : }
    1971             : 
    1972          31 : SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
    1973             :   SDLoc SL(Op);
    1974          31 :   SDValue Src = Op.getOperand(0);
    1975             :   // Expansion built entirely from MVT::f64 nodes:
    1976             :   // result = trunc(src)
    1977             :   // if (src > 0.0 && src != result)
    1978             :   //   result += 1.0
    1979             : 
    1980          31 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    1981             : 
    1982          31 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    1983          31 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    1984             : 
    1985             :   EVT SetCCVT =
    1986          93 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    1987             :   // Note: despite its name, Lt0 is the ordered compare Src > 0.0.
    1988          31 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
    1989          31 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    1990          31 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    1991             :   // Add is 1.0 when both conditions hold, otherwise 0.0.
    1992          31 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
    1993             :   // TODO: Should this propagate fast-math-flags?
    1994          62 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    1995             : }
    1996             : 
    1997          91 : static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
    1998             :                                   SelectionDAG &DAG) {
    1999             :   const unsigned FractBits = 52;
    2000             :   const unsigned ExpBits = 11;
    2001             :   // BFE_U32 operands: Hi, offset FractBits - 32 (= 20), width ExpBits (= 11).
    2002             :   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    2003             :                                 Hi,
    2004             :                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
    2005         273 :                                 DAG.getConstant(ExpBits, SL, MVT::i32));
    2006             :   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
    2007         182 :                             DAG.getConstant(1023, SL, MVT::i32));
    2008             :   // Exp is the extracted field minus 1023, as an i32.
    2009          91 :   return Exp;
    2010             : }
    2011             : 
    2012          75 : SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    2013             :   SDLoc SL(Op);
    2014          75 :   SDValue Src = Op.getOperand(0);
    2015             :   // Only f64 is handled here; the truncation is done with integer bit masking.
    2016             :   assert(Op.getValueType() == MVT::f64);
    2017             : 
    2018          75 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2019          75 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2020             : 
    2021          75 :   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2022             : 
    2023             :   // Extract the upper half, since this is where we will find the sign and
    2024             :   // exponent.
    2025          75 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
    2026             : 
    2027          75 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    2028             : 
    2029             :   const unsigned FractBits = 52;
    2030             : 
    2031             :   // Extract the sign bit.
    2032          75 :   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
    2033          75 :   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
    2034             : 
    2035             :   // Extend back to 64-bits.
    2036         150 :   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
    2037          75 :   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
    2038             :   // Clear the bits of Src selected by (FractMask >> Exp).
    2039          75 :   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
    2040             :   const SDValue FractMask
    2041          75 :     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
    2042             : 
    2043          75 :   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
    2044          75 :   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
    2045          75 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
    2046             : 
    2047             :   EVT SetCCVT =
    2048         225 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2049             : 
    2050          75 :   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
    2051             : 
    2052          75 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2053          75 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2054             :   // Exp < 0 selects the sign-only value; Exp > 51 returns Src's bits unchanged.
    2055          75 :   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
    2056          75 :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
    2057             : 
    2058         150 :   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
    2059             : }
    2060             : 
    2061          14 : SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
    2062             :   SDLoc SL(Op);
    2063          14 :   SDValue Src = Op.getOperand(0);
    2064             :   // Only f64 is handled here.
    2065             :   assert(Op.getValueType() == MVT::f64);
    2066             :   // Add and then subtract 2^52 carrying the sign of Src.
    2067          14 :   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
    2068          14 :   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
    2069          14 :   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
    2070             : 
    2071             :   // TODO: Should this propagate fast-math-flags?
    2072             : 
    2073          14 :   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
    2074          14 :   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
    2075             : 
    2076          14 :   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
    2077             :   // C2 is the largest double below 2^52 (2^52 - 0.5).
    2078          14 :   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
    2079          14 :   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
    2080             :   // If |Src| > C2, Src is returned unchanged; otherwise Tmp2 is returned.
    2081             :   EVT SetCCVT =
    2082          42 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2083          14 :   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
    2084             : 
    2085          28 :   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
    2086             : }
    2087             : 
    2088          45 : SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
    2089             :   // FNEARBYINT and FRINT are the same, except in their handling of FP
    2090             :   // exceptions. Those aren't really meaningful for us, and OpenCL only has
    2091             :   // rint, so just treat them as equivalent.
    2092          90 :   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
    2093             : }
    2094             : 
    2095             : // XXX - May require not supporting f32 denormals?
    2096             : 
    2097             : // Don't handle v2f16. The extra instructions to scalarize and repack around the
    2098             : // compare and vselect end up producing worse code than scalarizing the whole
    2099             : // operation.
    2100          74 : SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
    2101             :   SDLoc SL(Op);
    2102          74 :   SDValue X = Op.getOperand(0);
    2103          74 :   EVT VT = Op.getValueType();
    2104             : 
    2105          74 :   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
    2106             : 
    2107             :   // TODO: Should this propagate fast-math-flags?
    2108             : 
    2109          74 :   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
    2110             : 
    2111          74 :   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
    2112             : 
    2113          74 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
    2114          74 :   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
    2115          74 :   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
    2116             : 
    2117          74 :   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
    2118             : 
    2119             :   EVT SetCCVT =
    2120         148 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    2121             : 
    2122          74 :   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
    2123             : 
    2124          74 :   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
    2125             : 
    2126         148 :   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
    2127             : }
    2128             : 
    2129          16 : SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
    2130             :   SDLoc SL(Op);
    2131          16 :   SDValue X = Op.getOperand(0);
    2132             : 
    2133          16 :   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
    2134             : 
    2135          16 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2136          16 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2137          16 :   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
    2138          16 :   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
    2139             :   EVT SetCCVT =
    2140          48 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
    2141             : 
    2142          16 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    2143             : 
    2144          16 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
    2145             : 
    2146          16 :   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
    2147             : 
    2148             :   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
    2149          16 :                                        MVT::i64);
    2150             : 
    2151          16 :   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
    2152             :   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
    2153             :                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
    2154             :                                           MVT::i64),
    2155          32 :                           Exp);
    2156             : 
    2157          16 :   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
    2158             :   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
    2159             :                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
    2160          16 :                               ISD::SETNE);
    2161             : 
    2162             :   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
    2163          32 :                              D, DAG.getConstant(0, SL, MVT::i64));
    2164          16 :   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
    2165             : 
    2166          32 :   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
    2167          16 :   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
    2168             : 
    2169          16 :   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
    2170          16 :   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
    2171          16 :   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
    2172             : 
    2173             :   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
    2174             :                             ExpEqNegOne,
    2175             :                             DAG.getConstantFP(1.0, SL, MVT::f64),
    2176          48 :                             DAG.getConstantFP(0.0, SL, MVT::f64));
    2177             : 
    2178          16 :   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
    2179             : 
    2180          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
    2181          16 :   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
    2182             : 
    2183          32 :   return K;
    2184             : }
    2185             : 
    2186          90 : SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
    2187          90 :   EVT VT = Op.getValueType();
    2188             : 
    2189             :   if (VT == MVT::f32 || VT == MVT::f16)
    2190          74 :     return LowerFROUND32_16(Op, DAG);
    2191             : 
    2192             :   if (VT == MVT::f64)
    2193          16 :     return LowerFROUND64(Op, DAG);
    2194             : 
    2195           0 :   llvm_unreachable("unhandled type");
    2196             : }
    2197             : 
    2198           0 : SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
    2199             :   SDLoc SL(Op);
    2200           0 :   SDValue Src = Op.getOperand(0);
    2201             : 
    2202             :   // result = trunc(src);
    2203             :   // if (src < 0.0 && src != result)
    2204             :   //   result += -1.0.
    2205             : 
    2206           0 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2207             : 
    2208           0 :   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    2209           0 :   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
    2210             : 
    2211             :   EVT SetCCVT =
    2212           0 :       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    2213             : 
    2214           0 :   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
    2215           0 :   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    2216           0 :   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
    2217             : 
    2218           0 :   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
    2219             :   // TODO: Should this propagate fast-math-flags?
    2220           0 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
    2221             : }
    2222             : 
    2223          74 : SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
    2224             :                                         double Log2BaseInverted) const {
    2225          74 :   EVT VT = Op.getValueType();
    2226             : 
    2227             :   SDLoc SL(Op);
    2228          74 :   SDValue Operand = Op.getOperand(0);
    2229          74 :   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
    2230          74 :   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
    2231             : 
    2232         148 :   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
    2233             : }
    2234             : 
    2235             : static bool isCtlzOpc(unsigned Opc) {
    2236        3438 :   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
    2237             : }
    2238             : 
    2239             : static bool isCttzOpc(unsigned Opc) {
    2240        4283 :   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
    2241             : }
    2242             : 
    2243         415 : SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
    2244             :   SDLoc SL(Op);
    2245         415 :   SDValue Src = Op.getOperand(0);
    2246         415 :   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
    2247             :                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
    2248             : 
    2249             :   unsigned ISDOpc, NewOpc;
    2250         415 :   if (isCtlzOpc(Op.getOpcode())) {
    2251             :     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
    2252             :     NewOpc = AMDGPUISD::FFBH_U32;
    2253          73 :   } else if (isCttzOpc(Op.getOpcode())) {
    2254             :     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
    2255             :     NewOpc = AMDGPUISD::FFBL_B32;
    2256             :   } else
    2257           0 :     llvm_unreachable("Unexpected OPCode!!!");
    2258             : 
    2259             : 
    2260         415 :   if (ZeroUndef && Src.getValueType() == MVT::i32)
    2261         329 :     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
    2262             : 
    2263          86 :   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2264             : 
    2265          86 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2266          86 :   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2267             : 
    2268          86 :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
    2269          86 :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
    2270             : 
    2271             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2272         258 :                                    *DAG.getContext(), MVT::i32);
    2273             : 
    2274         172 :   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
    2275          86 :   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
    2276             : 
    2277          86 :   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
    2278          86 :   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
    2279             : 
    2280          86 :   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
    2281          86 :   SDValue Add, NewOpr;
    2282          86 :   if (isCtlzOpc(Op.getOpcode())) {
    2283          82 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
    2284             :     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
    2285          82 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
    2286             :   } else {
    2287           4 :     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
    2288             :     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
    2289           4 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
    2290             :   }
    2291             : 
    2292          86 :   if (!ZeroUndef) {
    2293             :     // Test if the full 64-bit input is zero.
    2294             : 
    2295             :     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    2296             :     // which we probably don't want.
    2297           8 :     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
    2298           8 :     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
    2299           8 :     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
    2300             : 
    2301             :     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    2302             :     // with the same cycles, otherwise it is slower.
    2303             :     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    2304             :     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
    2305             : 
    2306           8 :     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
    2307             : 
    2308             :     // The instruction returns -1 for 0 input, but the defined intrinsic
    2309             :     // behavior is to return the number of bits.
    2310           8 :     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
    2311           8 :                          SrcIsZero, Bits32, NewOpr);
    2312             :   }
    2313             : 
    2314          86 :   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
    2315             : }
    2316             : 
    2317          67 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
    2318             :                                                bool Signed) const {
    2319             :   // Unsigned
    2320             :   // cul2f(ulong u)
    2321             :   //{
    2322             :   //  uint lz = clz(u);
    2323             :   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
    2324             :   //  u = (u << lz) & 0x7fffffffffffffffUL;
    2325             :   //  ulong t = u & 0xffffffffffUL;
    2326             :   //  uint v = (e << 23) | (uint)(u >> 40);
    2327             :   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
    2328             :   //  return as_float(v + r);
    2329             :   //}
    2330             :   // Signed
    2331             :   // cl2f(long l)
    2332             :   //{
    2333             :   //  long s = l >> 63;
    2334             :   //  float r = cul2f((l + s) ^ s);
    2335             :   //  return s ? -r : r;
    2336             :   //}
    2337             : 
    2338             :   SDLoc SL(Op);
    2339          67 :   SDValue Src = Op.getOperand(0);
    2340          67 :   SDValue L = Src;
    2341             : 
    2342          67 :   SDValue S;
    2343          67 :   if (Signed) {
    2344          32 :     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
    2345          32 :     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
    2346             : 
    2347          32 :     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
    2348          32 :     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
    2349             :   }
    2350             : 
    2351             :   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
    2352         201 :                                    *DAG.getContext(), MVT::f32);
    2353             : 
    2354             : 
    2355          67 :   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
    2356          67 :   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
    2357          67 :   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
    2358          67 :   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
    2359             : 
    2360          67 :   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
    2361             :   SDValue E = DAG.getSelect(SL, MVT::i32,
    2362             :     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
    2363             :     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
    2364         134 :     ZeroI32);
    2365             : 
    2366             :   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
    2367             :     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
    2368         201 :     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
    2369             : 
    2370             :   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
    2371         134 :                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
    2372             : 
    2373             :   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
    2374         134 :                              U, DAG.getConstant(40, SL, MVT::i64));
    2375             : 
    2376             :   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
    2377             :     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
    2378         268 :     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
    2379             : 
    2380          67 :   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
    2381          67 :   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
    2382          67 :   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
    2383             : 
    2384          67 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    2385             : 
    2386          67 :   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
    2387             : 
    2388             :   SDValue R = DAG.getSelect(SL, MVT::i32,
    2389             :     RCmp,
    2390             :     One,
    2391         134 :     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
    2392          67 :   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
    2393          67 :   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
    2394             : 
    2395          67 :   if (!Signed)
    2396          35 :     return R;
    2397             : 
    2398          32 :   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
    2399          64 :   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
    2400             : }
    2401             : 
    2402          10 : SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
    2403             :                                                bool Signed) const {
    2404             :   SDLoc SL(Op);
    2405          10 :   SDValue Src = Op.getOperand(0);
    2406             : 
    2407          10 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
    2408             : 
    2409             :   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2410          20 :                            DAG.getConstant(0, SL, MVT::i32));
    2411             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
    2412          20 :                            DAG.getConstant(1, SL, MVT::i32));
    2413             : 
    2414             :   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
    2415          10 :                               SL, MVT::f64, Hi);
    2416             : 
    2417          10 :   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
    2418             : 
    2419             :   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
    2420          20 :                               DAG.getConstant(32, SL, MVT::i32));
    2421             :   // TODO: Should this propagate fast-math-flags?
    2422          20 :   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
    2423             : }
    2424             : 
    2425          47 : SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
    2426             :                                                SelectionDAG &DAG) const {
    2427             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2428             :          "operation should be legal");
    2429             : 
    2430             :   // TODO: Factor out code common with LowerSINT_TO_FP.
    2431             : 
    2432             :   EVT DestVT = Op.getValueType();
    2433          47 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2434             :     SDLoc DL(Op);
    2435           4 :     SDValue Src = Op.getOperand(0);
    2436             : 
    2437           4 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    2438           8 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2439             :     SDValue FPRound =
    2440           4 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2441             : 
    2442           4 :     return FPRound;
    2443             :   }
    2444             : 
    2445             :   if (DestVT == MVT::f32)
    2446          35 :     return LowerINT_TO_FP32(Op, DAG, false);
    2447             : 
    2448             :   assert(DestVT == MVT::f64);
    2449           8 :   return LowerINT_TO_FP64(Op, DAG, false);
    2450             : }
    2451             : 
    2452          38 : SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
    2453             :                                               SelectionDAG &DAG) const {
    2454             :   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
    2455             :          "operation should be legal");
    2456             : 
    2457             :   // TODO: Factor out code common with LowerUINT_TO_FP.
    2458             : 
    2459             :   EVT DestVT = Op.getValueType();
    2460          38 :   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    2461             :     SDLoc DL(Op);
    2462           4 :     SDValue Src = Op.getOperand(0);
    2463             : 
    2464           4 :     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    2465           8 :     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
    2466             :     SDValue FPRound =
    2467           4 :         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
    2468             : 
    2469           4 :     return FPRound;
    2470             :   }
    2471             : 
    2472             :   if (DestVT == MVT::f32)
    2473          32 :     return LowerINT_TO_FP32(Op, DAG, true);
    2474             : 
    2475             :   assert(DestVT == MVT::f64);
    2476           2 :   return LowerINT_TO_FP64(Op, DAG, true);
    2477             : }
    2478             : 
    2479          16 : SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
    2480             :                                                bool Signed) const {
    2481             :   SDLoc SL(Op);
    2482             : 
    2483          16 :   SDValue Src = Op.getOperand(0);
    2484             : 
    2485          16 :   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
    2486             : 
    2487             :   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
    2488          16 :                                  MVT::f64);
    2489             :   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
    2490          16 :                                  MVT::f64);
    2491             :   // TODO: Should this propagate fast-math-flags?
    2492          16 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
    2493             : 
    2494          16 :   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
    2495             : 
    2496             : 
    2497          16 :   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
    2498             : 
    2499             :   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
    2500          16 :                            MVT::i32, FloorMul);
    2501          16 :   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
    2502             : 
    2503          32 :   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
    2504             : 
    2505          32 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
    2506             : }
    2507             : 
    2508         797 : SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
    2509             :   SDLoc DL(Op);
    2510         797 :   SDValue N0 = Op.getOperand(0);
    2511             : 
    2512             :   // Convert to target node to get known bits
    2513             :   if (N0.getValueType() == MVT::f32)
    2514         748 :     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
    2515             : 
    2516          49 :   if (getTargetMachine().Options.UnsafeFPMath) {
    2517             :     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    2518          10 :     return SDValue();
    2519             :   }
    2520             : 
    2521             :   assert(N0.getSimpleValueType() == MVT::f64);
    2522             : 
    2523             :   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
    2524             :   const unsigned ExpMask = 0x7ff;
    2525             :   const unsigned ExpBiasf64 = 1023;
    2526             :   const unsigned ExpBiasf16 = 15;
    2527          39 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    2528          39 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    2529          39 :   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
    2530             :   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
    2531          78 :                            DAG.getConstant(32, DL, MVT::i64));
    2532          39 :   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
    2533          39 :   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
    2534             :   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2535          78 :                           DAG.getConstant(20, DL, MVT::i64));
    2536          39 :   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
    2537          78 :                   DAG.getConstant(ExpMask, DL, MVT::i32));
    2538             :   // Subtract the fp64 exponent bias (1023) to get the real exponent and
    2539             :   // add the f16 bias (15) to get the biased exponent for the f16 format.
    2540          39 :   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
    2541          78 :                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
    2542             : 
    2543             :   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2544          78 :                           DAG.getConstant(8, DL, MVT::i32));
    2545          39 :   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
    2546          78 :                   DAG.getConstant(0xffe, DL, MVT::i32));
    2547             : 
    2548             :   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
    2549          78 :                                   DAG.getConstant(0x1ff, DL, MVT::i32));
    2550          39 :   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
    2551             : 
    2552          39 :   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
    2553          39 :   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
    2554             : 
    2555             :   // (M != 0 ? 0x0200 : 0) | 0x7c00;
    2556             :   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
    2557             :       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
    2558         117 :                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
    2559             : 
    2560             :   // N = M | (E << 12);
    2561             :   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2562             :       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
    2563         117 :                   DAG.getConstant(12, DL, MVT::i32)));
    2564             : 
    2565             :   // B = clamp(1-E, 0, 13);
    2566             :   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
    2567          39 :                                   One, E);
    2568          39 :   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
    2569          39 :   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
    2570          78 :                   DAG.getConstant(13, DL, MVT::i32));
    2571             : 
    2572             :   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
    2573          78 :                                    DAG.getConstant(0x1000, DL, MVT::i32));
    2574             : 
    2575          39 :   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
    2576          39 :   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
    2577          39 :   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
    2578          39 :   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
    2579             : 
    2580          39 :   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
    2581             :   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
    2582          78 :                               DAG.getConstant(0x7, DL, MVT::i32));
    2583          39 :   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
    2584          78 :                   DAG.getConstant(2, DL, MVT::i32));
    2585             :   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
    2586          39 :                                One, Zero, ISD::SETEQ);
    2587             :   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
    2588          39 :                                One, Zero, ISD::SETGT);
    2589          39 :   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
    2590          39 :   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
    2591             : 
    2592          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
    2593         117 :                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
    2594          39 :   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
    2595          78 :                       I, V, ISD::SETEQ);
    2596             : 
    2597             :   // Extract the sign bit.
    2598             :   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
    2599          78 :                             DAG.getConstant(16, DL, MVT::i32));
    2600          39 :   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
    2601          78 :                      DAG.getConstant(0x8000, DL, MVT::i32));
    2602             : 
    2603          39 :   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
    2604          39 :   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
    2605             : }
    2606             : 
    2607          65 : SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
    2608             :                                               SelectionDAG &DAG) const {
    2609          65 :   SDValue Src = Op.getOperand(0);
    2610             : 
    2611             :   // TODO: Factor out code common with LowerFP_TO_UINT.
    2612             : 
    2613             :   EVT SrcVT = Src.getValueType();
    2614          65 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2615             :     SDLoc DL(Op);
    2616             : 
    2617           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    2618             :     SDValue FpToInt32 =
    2619           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2620             : 
    2621           3 :     return FpToInt32;
    2622             :   }
    2623             : 
    2624             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2625           2 :     return LowerFP64_TO_INT(Op, DAG, true);
    2626             : 
    2627          60 :   return SDValue();
    2628             : }
    2629             : 
    2630          37 : SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
    2631             :                                               SelectionDAG &DAG) const {
    2632          37 :   SDValue Src = Op.getOperand(0);
    2633             : 
    2634             :   // TODO: Factor out code common with LowerFP_TO_SINT.
    2635             : 
    2636             :   EVT SrcVT = Src.getValueType();
    2637          37 :   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
    2638             :     SDLoc DL(Op);
    2639             : 
    2640           3 :     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    2641             :     SDValue FpToInt32 =
    2642           3 :         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
    2643             : 
    2644           3 :     return FpToInt32;
    2645             :   }
    2646             : 
    2647             :   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    2648          14 :     return LowerFP64_TO_INT(Op, DAG, false);
    2649             : 
    2650          20 :   return SDValue();
    2651             : }
    2652             : 
    2653          16 : SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    2654             :                                                      SelectionDAG &DAG) const {
    2655          16 :   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    2656          16 :   MVT VT = Op.getSimpleValueType();
    2657             :   MVT ScalarVT = VT.getScalarType();
    2658             : 
    2659             :   assert(VT.isVector());
    2660             : 
    2661          16 :   SDValue Src = Op.getOperand(0);
    2662             :   SDLoc DL(Op);
    2663             : 
    2664             :   // TODO: Don't scalarize on Evergreen?
    2665          16 :   unsigned NElts = VT.getVectorNumElements();
    2666             :   SmallVector<SDValue, 8> Args;
    2667          16 :   DAG.ExtractVectorElements(Src, Args, 0, NElts);
    2668             : 
    2669          16 :   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
    2670         116 :   for (unsigned I = 0; I < NElts; ++I)
    2671         150 :     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
    2672             : 
    2673          32 :   return DAG.getBuildVector(VT, DL, Args);
    2674             : }
    2675             : 
    2676             : //===----------------------------------------------------------------------===//
    2677             : // Custom DAG optimizations
    2678             : //===----------------------------------------------------------------------===//
    2679             : 
    2680             : static bool isU24(SDValue Op, SelectionDAG &DAG) {
    2681        8689 :   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
    2682             : }
    2683             : 
    2684        4306 : static bool isI24(SDValue Op, SelectionDAG &DAG) {
    2685        8612 :   EVT VT = Op.getValueType();
    2686        8612 :   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
    2687             :                                      // as unsigned 24-bit values.
    2688        8612 :     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
    2689             : }
    2690             : 
    2691        3141 : static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
    2692             :                         TargetLowering::DAGCombinerInfo &DCI) {
    2693             : 
    2694        3141 :   SelectionDAG &DAG = DCI.DAG;
    2695        6282 :   SDValue Op = Node24->getOperand(OpIdx);
    2696        3141 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    2697        3141 :   EVT VT = Op.getValueType();
    2698             : 
    2699        3141 :   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
    2700             :   APInt KnownZero, KnownOne;
    2701             :   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
    2702        3141 :   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
    2703             :     return true;
    2704             : 
    2705        2947 :   return false;
    2706             : }
    2707             : 
    2708             : template <typename IntTy>
    2709          48 : static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
    2710             :                                uint32_t Width, const SDLoc &DL) {
    2711          48 :   if (Width + Offset < 32) {
    2712          20 :     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    2713          20 :     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    2714          20 :     return DAG.getConstant(Result, DL, MVT::i32);
    2715             :   }
    2716             : 
    2717          28 :   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
    2718             : }
    2719             : 
    2720       92818 : static bool hasVolatileUser(SDNode *Val) {
    2721      284729 :   for (SDNode *U : Val->uses()) {
    2722       51348 :     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
    2723       51348 :       if (M->isVolatile())
    2724             :         return true;
    2725             :     }
    2726             :   }
    2727             : 
    2728             :   return false;
    2729             : }
    2730             : 
    2731      134471 : bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
    2732             :   // i32 vectors are the canonical memory type.
    2733      348405 :   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    2734             :     return false;
    2735             : 
    2736       11806 :   if (!VT.isByteSized())
    2737             :     return false;
    2738             : 
    2739             :   unsigned Size = VT.getStoreSize();
    2740             : 
    2741       18345 :   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    2742             :     return false;
    2743             : 
    2744        5090 :   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    2745             :     return false;
    2746             : 
    2747        4310 :   return true;
    2748             : }
    2749             : 
    2750             : // Replace load of an illegal type with a store of a bitcast to a friendlier
    2751             : // type.
    2752      144086 : SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
    2753             :                                                  DAGCombinerInfo &DCI) const {
    2754      144086 :   if (!DCI.isBeforeLegalize())
    2755       34548 :     return SDValue();
    2756             : 
    2757             :   LoadSDNode *LN = cast<LoadSDNode>(N);
    2758      202356 :   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    2759       20573 :     return SDValue();
    2760             : 
    2761             :   SDLoc SL(N);
    2762       88965 :   SelectionDAG &DAG = DCI.DAG;
    2763       88965 :   EVT VT = LN->getMemoryVT();
    2764             : 
    2765             :   unsigned Size = VT.getStoreSize();
    2766       88965 :   unsigned Align = LN->getAlignment();
    2767       88965 :   if (Align < Size && isTypeLegal(VT)) {
    2768             :     bool IsFast;
    2769             :     unsigned AS = LN->getAddressSpace();
    2770             : 
    2771             :     // Expand unaligned loads earlier than legalization. Due to visitation order
    2772             :     // problems during legalization, the emitted instructions to pack and unpack
    2773             :     // the bytes again are not eliminated in the case of an unaligned copy.
    2774       36028 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2775         351 :       if (VT.isVector())
    2776          60 :         return scalarizeVectorLoad(LN, DAG);
    2777             : 
    2778         291 :       SDValue Ops[2];
    2779         582 :       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
    2780         582 :       return DAG.getMergeValues(Ops, SDLoc(N));
    2781             :     }
    2782             : 
    2783       35677 :     if (!IsFast)
    2784          23 :       return SDValue();
    2785             :   }
    2786             : 
    2787       88591 :   if (!shouldCombineMemoryType(VT))
    2788       86433 :     return SDValue();
    2789             : 
    2790        2158 :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2791             : 
    2792             :   SDValue NewLoad
    2793             :     = DAG.getLoad(NewVT, SL, LN->getChain(),
    2794        4316 :                   LN->getBasePtr(), LN->getMemOperand());
    2795             : 
    2796        2158 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
    2797        2158 :   DCI.CombineTo(N, BC, NewLoad.getValue(1));
    2798        2158 :   return SDValue(N, 0);
    2799             : }
    2800             : 
    2801             : // Replace store of an illegal type with a store of a bitcast to a friendlier
    2802             : // type.
    2803       93951 : SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
    2804             :                                                   DAGCombinerInfo &DCI) const {
    2805       93951 :   if (!DCI.isBeforeLegalize())
    2806       34521 :     return SDValue();
    2807             : 
    2808             :   StoreSDNode *SN = cast<StoreSDNode>(N);
    2809       59430 :   if (SN->isVolatile() || !ISD::isNormalStore(SN))
    2810       13242 :     return SDValue();
    2811             : 
    2812       46188 :   EVT VT = SN->getMemoryVT();
    2813             :   unsigned Size = VT.getStoreSize();
    2814             : 
    2815             :   SDLoc SL(N);
    2816       46188 :   SelectionDAG &DAG = DCI.DAG;
    2817       46188 :   unsigned Align = SN->getAlignment();
    2818       46188 :   if (Align < Size && isTypeLegal(VT)) {
    2819             :     bool IsFast;
    2820             :     unsigned AS = SN->getAddressSpace();
    2821             : 
    2822             :     // Expand unaligned stores earlier than legalization. Due to visitation
    2823             :     // order problems during legalization, the emitted instructions to pack and
    2824             :     // unpack the bytes again are not eliminated in the case of an unaligned
    2825             :     // copy.
    2826        4301 :     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
    2827         308 :       if (VT.isVector())
    2828         381 :         return scalarizeVectorStore(SN, DAG);
    2829             : 
    2830         235 :       return expandUnalignedStore(SN, DAG);
    2831             :     }
    2832             : 
    2833        3993 :     if (!IsFast)
    2834           0 :       return SDValue();
    2835             :   }
    2836             : 
    2837       45880 :   if (!shouldCombineMemoryType(VT))
    2838       43728 :     return SDValue();
    2839             : 
    2840        2152 :   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    2841        2152 :   SDValue Val = SN->getValue();
    2842             : 
    2843             :   //DCI.AddToWorklist(Val.getNode());
    2844             : 
    2845             :   bool OtherUses = !Val.hasOneUse();
    2846        2152 :   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
    2847        2152 :   if (OtherUses) {
    2848          33 :     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    2849          33 :     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
    2850             :   }
    2851             : 
    2852             :   return DAG.getStore(SN->getChain(), SL, CastVal,
    2853        4304 :                       SN->getBasePtr(), SN->getMemOperand());
    2854             : }
    2855             : 
    2856             : // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
    2857             : // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
    2858             : // issues.
    2859       11025 : SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
    2860             :                                                         DAGCombinerInfo &DCI) const {
    2861       11025 :   SelectionDAG &DAG = DCI.DAG;
    2862       11025 :   SDValue N0 = N->getOperand(0);
    2863             : 
    2864             :   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
    2865             :   //     (vt2 (truncate (assertzext vt0:x, vt1)))
    2866       11025 :   if (N0.getOpcode() == ISD::TRUNCATE) {
    2867          34 :     SDValue N1 = N->getOperand(1);
    2868          34 :     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    2869             :     SDLoc SL(N);
    2870             : 
    2871          34 :     SDValue Src = N0.getOperand(0);
    2872          34 :     EVT SrcVT = Src.getValueType();
    2873          34 :     if (SrcVT.bitsGE(ExtVT)) {
    2874          68 :       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
    2875          68 :       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    2876             :     }
    2877             :   }
    2878             : 
    2879       10991 :   return SDValue();
    2880             : }
    2881             : /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
    2882             : /// binary operation \p Opc to it with the corresponding constant operands.
    2883        1626 : SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    2884             :   DAGCombinerInfo &DCI, const SDLoc &SL,
    2885             :   unsigned Opc, SDValue LHS,
    2886             :   uint32_t ValLo, uint32_t ValHi) const {
    2887        1626 :   SelectionDAG &DAG = DCI.DAG;
    2888             :   SDValue Lo, Hi;
    2889        3252 :   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
    2890             : 
    2891        1626 :   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
    2892        1626 :   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
    2893             : 
    2894        1626 :   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
    2895        1626 :   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
    2896             : 
    2897             :   // Re-visit the ands. It's possible we eliminated one of them and it could
    2898             :   // simplify the vector.
    2899        1626 :   DCI.AddToWorklist(Lo.getNode());
    2900        1626 :   DCI.AddToWorklist(Hi.getNode());
    2901             : 
    2902        3252 :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
    2903        1626 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2904             : }
    2905             : 
    2906       18678 : SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
    2907             :                                                 DAGCombinerInfo &DCI) const {
    2908       37356 :   EVT VT = N->getValueType(0);
    2909             : 
    2910       18678 :   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2911             :   if (!RHS)
    2912        3775 :     return SDValue();
    2913             : 
    2914       14903 :   SDValue LHS = N->getOperand(0);
    2915       29806 :   unsigned RHSVal = RHS->getZExtValue();
    2916       14903 :   if (!RHSVal)
    2917           0 :     return LHS;
    2918             : 
    2919             :   SDLoc SL(N);
    2920       14903 :   SelectionDAG &DAG = DCI.DAG;
    2921             : 
    2922       29806 :   switch (LHS->getOpcode()) {
    2923             :   default:
    2924             :     break;
    2925        4341 :   case ISD::ZERO_EXTEND:
    2926             :   case ISD::SIGN_EXTEND:
    2927             :   case ISD::ANY_EXTEND: {
    2928        4341 :     SDValue X = LHS->getOperand(0);
    2929             : 
    2930         574 :     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
    2931             :         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
    2932             :       // Prefer build_vector as the canonical form if packed types are legal.
    2933             :       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
    2934             :       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
    2935          26 :        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
    2936          13 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    2937             :     }
    2938             : 
    2939             :     // shl (ext x) => zext (shl x), if shift does not overflow int
    2940             :     if (VT != MVT::i64)
    2941             :       break;
    2942             :     KnownBits Known;
    2943        3765 :     DAG.computeKnownBits(X, Known);
    2944             :     unsigned LZ = Known.countMinLeadingZeros();
    2945        3765 :     if (LZ < RHSVal)
    2946             :       break;
    2947        2728 :     EVT XVT = X.getValueType();
    2948        2728 :     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    2949        2728 :     return DAG.getZExtOrTrunc(Shl, SL, VT);
    2950             :   }
    2951             :   }
    2952             : 
    2953             :   if (VT != MVT::i64)
    2954       10356 :     return SDValue();
    2955             : 
    2956             :   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
    2957             : 
    2958             :   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    2959             :   // common case, splitting this into a move and a 32-bit shift is faster and
    2960             :   // the same code size.
    2961        1806 :   if (RHSVal < 32)
    2962         994 :     return SDValue();
    2963             : 
    2964         812 :   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
    2965             : 
    2966         812 :   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
    2967         812 :   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
    2968             : 
    2969         812 :   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    2970             : 
    2971        1624 :   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
    2972         812 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    2973             : }
    2974             : 
    2975        5869 : SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
    2976             :                                                 DAGCombinerInfo &DCI) const {
    2977        5869 :   if (N->getValueType(0) != MVT::i64)
    2978        5470 :     return SDValue();
    2979             : 
    2980         399 :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    2981             :   if (!RHS)
    2982          20 :     return SDValue();
    2983             : 
    2984         379 :   SelectionDAG &DAG = DCI.DAG;
    2985             :   SDLoc SL(N);
    2986         758 :   unsigned RHSVal = RHS->getZExtValue();
    2987             : 
    2988             :   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
    2989         379 :   if (RHSVal == 32) {
    2990          12 :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    2991             :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    2992          12 :                                    DAG.getConstant(31, SL, MVT::i32));
    2993             : 
    2994          12 :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    2995           6 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    2996             :   }
    2997             : 
    2998             :   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
    2999         373 :   if (RHSVal == 63) {
    3000         252 :     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    3001             :     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
    3002         252 :                                    DAG.getConstant(31, SL, MVT::i32));
    3003         252 :     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    3004         126 :     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
    3005             :   }
    3006             : 
    3007         247 :   return SDValue();
    3008             : }
    3009             : 
    3010       60793 : SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
    3011             :                                                 DAGCombinerInfo &DCI) const {
    3012       60793 :   if (N->getValueType(0) != MVT::i64)
    3013       51361 :     return SDValue();
    3014             : 
    3015        9432 :   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    3016             :   if (!RHS)
    3017         392 :     return SDValue();
    3018             : 
    3019       18080 :   unsigned ShiftAmt = RHS->getZExtValue();
    3020        9040 :   if (ShiftAmt < 32)
    3021         163 :     return SDValue();
    3022             : 
    3023             :   // srl i64:x, C for C >= 32
    3024             :   // =>
    3025             :   //   build_pair (srl hi_32(x), C - 32), 0
    3026             : 
    3027        8877 :   SelectionDAG &DAG = DCI.DAG;
    3028             :   SDLoc SL(N);
    3029             : 
    3030        8877 :   SDValue One = DAG.getConstant(1, SL, MVT::i32);
    3031        8877 :   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
    3032             : 
    3033       17754 :   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
    3034             :   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
    3035        8877 :                            VecOp, One);
    3036             : 
    3037        8877 :   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
    3038        8877 :   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
    3039             : 
    3040       17754 :   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
    3041             : 
    3042        8877 :   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
    3043             : }
    3044             : 
    3045       52595 : SDValue AMDGPUTargetLowering::performTruncateCombine(
    3046             :   SDNode *N, DAGCombinerInfo &DCI) const {
    3047             :   SDLoc SL(N);
    3048       52595 :   SelectionDAG &DAG = DCI.DAG;
    3049      105190 :   EVT VT = N->getValueType(0);
    3050       52595 :   SDValue Src = N->getOperand(0);
    3051             : 
    3052             :   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
    3053       52595 :   if (Src.getOpcode() == ISD::BITCAST) {
    3054        5675 :     SDValue Vec = Src.getOperand(0);
    3055        5675 :     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
    3056        2070 :       SDValue Elt0 = Vec.getOperand(0);
    3057        2070 :       EVT EltVT = Elt0.getValueType();
    3058        2070 :       if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
    3059        1940 :         if (EltVT.isFloatingPoint()) {
    3060          12 :           Elt0 = DAG.getNode(ISD::BITCAST, SL,
    3061          24 :                              EltVT.changeTypeToInteger(), Elt0);
    3062             :         }
    3063             : 
    3064        1940 :         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
    3065             :       }
    3066             :     }
    3067             :   }
    3068             : 
    3069             :   // Equivalent of above for accessing the high element of a vector as an
    3070             :   // integer operation.
    3071             :   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
    3072       50655 :   if (Src.getOpcode() == ISD::SRL) {
    3073       26365 :     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
    3074      105192 :       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
    3075             :         SDValue BV = stripBitcast(Src.getOperand(0));
    3076       20787 :         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
    3077       20787 :             BV.getValueType().getVectorNumElements() == 2) {
    3078        1225 :           SDValue SrcElt = BV.getOperand(1);
    3079        1225 :           EVT SrcEltVT = SrcElt.getValueType();
    3080        1225 :           if (SrcEltVT.isFloatingPoint()) {
    3081           6 :             SrcElt = DAG.getNode(ISD::BITCAST, SL,
    3082          12 :                                  SrcEltVT.changeTypeToInteger(), SrcElt);
    3083             :           }
    3084             : 
    3085        1225 :           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
    3086             :         }
    3087             :       }
    3088             :     }
    3089             :   }
    3090             : 
    3091             :   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
    3092             :   //
    3093             :   // i16 (trunc (srl i64:x, K)), K <= 16 ->
    3094             :   //     i16 (trunc (srl (i32 (trunc x), K)))
    3095       49430 :   if (VT.getScalarSizeInBits() < 32) {
    3096       15311 :     EVT SrcVT = Src.getValueType();
    3097       19873 :     if (SrcVT.getScalarSizeInBits() > 32 &&
    3098        1583 :         (Src.getOpcode() == ISD::SRL ||
    3099        1527 :          Src.getOpcode() == ISD::SRA ||
    3100             :          Src.getOpcode() == ISD::SHL)) {
    3101        3041 :       SDValue Amt = Src.getOperand(1);
    3102        2388 :       KnownBits Known;
    3103        3041 :       DAG.computeKnownBits(Amt, Known);
    3104             :       unsigned Size = VT.getScalarSizeInBits();
    3105        8453 :       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
    3106        2389 :           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
    3107             :         EVT MidVT = VT.isVector() ?
    3108           1 :           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    3109        1307 :                            VT.getVectorNumElements()) : MVT::i32;
    3110             : 
    3111        1306 :         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
    3112             :         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
    3113         653 :                                     Src.getOperand(0));
    3114         653 :         DCI.AddToWorklist(Trunc.getNode());
    3115             : 
    3116         653 :         if (Amt.getValueType() != NewShiftVT) {
    3117           1 :           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
    3118           1 :           DCI.AddToWorklist(Amt.getNode());
    3119             :         }
    3120             : 
    3121             :         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
    3122         653 :                                           Trunc, Amt);
    3123         653 :         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
    3124             :       }
    3125             :     }
    3126             :   }
    3127             : 
    3128       48777 :   return SDValue();
    3129             : }
    3130             : 
    3131             : // We need to specifically handle i64 mul here to avoid unnecessary conversion
    3132             : // instructions. If we only match on the legalized i64 mul expansion,
    3133             : // SimplifyDemandedBits will be unable to remove them because there will be
    3134             : // multiple uses due to the separate mul + mulh[su].
    3135         424 : static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
    3136             :                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
    3137         424 :   if (Size <= 32) {
    3138         390 :     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3139         390 :     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
    3140             :   }
    3141             : 
    3142             :   // Because we want to eliminate extension instructions before the
    3143             :   // operation, we need to create a single user here (i.e. not the separate
    3144             :   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
    3145             : 
    3146          34 :   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
    3147             : 
    3148             :   SDValue Mul = DAG.getNode(MulOpc, SL,
    3149          34 :                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
    3150             : 
    3151             :   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
    3152          34 :                      Mul.getValue(0), Mul.getValue(1));
    3153             : }
    3154             : 
    //               : // DAG combine for a scalar integer multiply (operands 0 and 1 of N).
    //               : //
    //               : // Returns an empty SDValue when the result type is a vector or wider than
    //               : // 64 bits, when the subtarget has 16-bit instructions and the type is i16 or
    //               : // narrower, or when the operands pass neither the isU24 nor the isI24 test.
    //               : // Otherwise both operands are converted to i32 (zero- or sign-extended to
    //               : // match the test that passed), a 24-bit multiply is built through getMul24,
    //               : // and the product is sign-extended or truncated back to the original type.
    3155        5124 : SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
    3156             :                                                 DAGCombinerInfo &DCI) const {
    3157       10248 :   EVT VT = N->getValueType(0);
    3158             : 
    3159        5124 :   unsigned Size = VT.getSizeInBits();
    3160        5124 :   if (VT.isVector() || Size > 64)
    3161         154 :     return SDValue();
    3162             : 
    3163             :   // There are i16 integer mul/mad.
    3164        9765 :   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    3165          81 :     return SDValue();
    3166             : 
    3167        4889 :   SelectionDAG &DAG = DCI.DAG;
    3168             :   SDLoc DL(N);
    3169             : 
    3170        4889 :   SDValue N0 = N->getOperand(0);
    3171        4889 :   SDValue N1 = N->getOperand(1);
    3172             : 
    3173             :   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
    3174             :   // in the source into any_extends if the result of the mul is truncated. Since
    3175             :   // we can assume the high bits are whatever we want, use the underlying value
    3176             :   // to avoid the unknown high bits from interfering.
    3177        4889 :   if (N0.getOpcode() == ISD::ANY_EXTEND)
    3178          28 :     N0 = N0.getOperand(0);
    3179             : 
    3180        4889 :   if (N1.getOpcode() == ISD::ANY_EXTEND)
    3181          20 :     N1 = N1.getOperand(0);
    3182             : 
    3183        4889 :   SDValue Mul;
    3184             : 
    3185       15137 :   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { // unsigned form is tried first
    3186         395 :     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3187         395 :     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3188         395 :     Mul = getMul24(DAG, DL, N0, N1, Size, false);
    3189        8613 :   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    3190          29 :     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3191          29 :     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3192          29 :     Mul = getMul24(DAG, DL, N0, N1, Size, true);
    3193             :   } else {
    3194        4465 :     return SDValue();
    3195             :   }
    3196             : 
    3197             :   // We need to use sext even for MUL_U24, because MUL_U24 is used
    3198             :   // for signed multiply of 8 and 16-bit types.
    3199         424 :   return DAG.getSExtOrTrunc(Mul, DL, VT);
    3200             : }
    3201             : 
    //               : // DAG combine for a signed multiply-high node (operands 0 and 1 of N).
    //               : //
    //               : // Returns an empty SDValue unless the subtarget has a 24-bit signed multiply,
    //               : // the result type is scalar, and both operands pass isI24. In that case the
    //               : // operands are sign-extended/truncated to i32, an AMDGPUISD::MULHI_I24 node is
    //               : // created and queued on the combiner worklist, and its value is
    //               : // sign-extended/truncated to the original result type.
    //               : //
    //               : // NOTE(review): unlike performMulhuCombine there is no upper bound on the bit
    //               : // width of VT here - confirm that this is intended.
    3202         100 : SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
    3203             :                                                   DAGCombinerInfo &DCI) const {
    3204         200 :   EVT VT = N->getValueType(0);
    3205             : 
    3206         172 :   if (!Subtarget->hasMulI24() || VT.isVector())
    3207          28 :     return SDValue();
    3208             : 
    3209          72 :   SelectionDAG &DAG = DCI.DAG;
    3210             :   SDLoc DL(N);
    3211             : 
    3212          72 :   SDValue N0 = N->getOperand(0);
    3213          72 :   SDValue N1 = N->getOperand(1);
    3214             : 
    3215          72 :   if (!isI24(N0, DAG) || !isI24(N1, DAG))
    3216          72 :     return SDValue();
    3217             : 
    3218           0 :   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    3219           0 :   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    3220             : 
    3221           0 :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
    3222           0 :   DCI.AddToWorklist(Mulhi.getNode()); // let the combiner visit the new node
    3223           0 :   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
    3224             : }
    3225             : 
    //               : // DAG combine for an unsigned multiply-high node (operands 0 and 1 of N).
    //               : //
    //               : // Returns an empty SDValue unless the subtarget has a 24-bit unsigned
    //               : // multiply, the result type is a scalar of at most 32 bits, and both operands
    //               : // pass isU24. In that case the operands are zero-extended/truncated to i32,
    //               : // an AMDGPUISD::MULHI_U24 node is created and queued on the combiner
    //               : // worklist, and its value is zero-extended/truncated to the original type.
    3226        3330 : SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
    3227             :                                                   DAGCombinerInfo &DCI) const {
    3228        6660 :   EVT VT = N->getValueType(0);
    3229             : 
    3230        9990 :   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    3231           0 :     return SDValue();
    3232             : 
    3233        3330 :   SelectionDAG &DAG = DCI.DAG;
    3234             :   SDLoc DL(N);
    3235             : 
    3236        3330 :   SDValue N0 = N->getOperand(0);
    3237        3330 :   SDValue N1 = N->getOperand(1);
    3238             : 
    3239        3330 :   if (!isU24(N0, DAG) || !isU24(N1, DAG))
    3240        3330 :     return SDValue();
    3241             : 
    3242           0 :   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    3243           0 :   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    3244             : 
    3245           0 :   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
    3246           0 :   DCI.AddToWorklist(Mulhi.getNode()); // let the combiner visit the new node
    3247           0 :   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
    3248             : }
    3249             : 
    //               : // DAG combine for the two-result MUL_LOHI_I24 / MUL_LOHI_U24 nodes.
    //               : //
    //               : // simplifyI24 is first run on operand 0 and then on operand 1; if either call
    //               : // reports success an empty SDValue is returned (presumably the node was
    //               : // already updated by that call - TODO confirm against simplifyI24).
    //               : // Otherwise the node is split into a separate low-half node
    //               : // (MUL_I24 / MUL_U24) and high-half node (MULHI_I24 / MULHI_U24), both i32 and
    //               : // using the same operands, which are returned together as merged values in
    //               : // { low, high } order.
    3250         122 : SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
    3251             :   SDNode *N, DAGCombinerInfo &DCI) const {
    3252         122 :   SelectionDAG &DAG = DCI.DAG;
    3253             : 
    3254             :   // Simplify demanded bits before splitting into multiple users.
    3255         122 :   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
    3256          88 :     return SDValue();
    3257             : 
    3258          34 :   SDValue N0 = N->getOperand(0);
    3259          34 :   SDValue N1 = N->getOperand(1);
    3260             : 
    3261          34 :   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
    3262             : 
    3263          34 :   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    3264          34 :   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
    3265             : 
    3266             :   SDLoc SL(N);
    3267             : 
    3268          34 :   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
    3269          34 :   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
    3270          68 :   return DAG.getMergeValues({ MulLo, MulHi }, SL);
    3271             : }
    3272             : 
    //               : // Returns true only when Val is a ConstantSDNode whose value has all bits
    //               : // set (i.e. -1); any non-constant value yields false.
    3273          25 : static bool isNegativeOne(SDValue Val) {
    3274             :   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
    3275          25 :     return C->isAllOnesValue();
    3276             :   return false;
    3277             : }
    3278             : 
    //               : // Builds the node Opc (called with FFBH_U32 / FFBL_B32 in this file) on a
    //               : // 32-bit view of Op.
    //               : //
    //               : // If Op is not already i32 it is zero-extended to i32 before the operation
    //               : // and the i32 result is truncated back to Op's type afterwards. An empty
    //               : // SDValue is returned when the type Op legalizes to is not i32 and, on a
    //               : // subtarget with 16-bit instructions, is not i16 either.
    //               : //
    //               : // NOTE(review): on a subtarget without 16-bit instructions the guard never
    //               : // fires, whatever LegalVT is - confirm that types wider than i32 cannot reach
    //               : // the zero_extend below.
    3279          21 : SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
    3280             :                                           SDValue Op,
    3281             :                                           const SDLoc &DL,
    3282             :                                           unsigned Opc) const {
    3283          21 :   EVT VT = Op.getValueType();
    3284          21 :   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
    3285           3 :   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
    3286             :                               LegalVT != MVT::i16))
    3287           0 :     return SDValue();
    3288             : 
    3289             :   if (VT != MVT::i32)
    3290          11 :     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
    3291             : 
    3292          21 :   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
    3293             :   if (VT != MVT::i32)
    3294          11 :     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
    3295             : 
    3296          21 :   return FFBX;
    3297             : }
    3298             : 
    3299             : // The native instructions return -1 on 0 input. Optimize out a select that
    3300             : // produces -1 on 0.
    3301             : //
    //               : // Cond is the select's setcc condition; LHS and RHS are the select's true and
    //               : // false values (not the operands of the compare). Only compares against the
    //               : // constant 0 are considered. Returns the replacement FFBH_U32 / FFBL_B32
    //               : // value built by getFFBX_U32, or an empty SDValue when no pattern matches.
    //               : //
    3302             : // TODO: If zero is not undef, we could also do this if the output is compared
    3303             : // against the bitwidth.
    3304             : //
    3305             : // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
    3306        8503 : SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
    3307             :                                                  SDValue LHS, SDValue RHS,
    3308             :                                                  DAGCombinerInfo &DCI) const {
    3309             :   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    3310       11374 :   if (!CmpRhs || !CmpRhs->isNullValue())
    3311        4293 :     return SDValue();
    3312             : 
    3313        4210 :   SelectionDAG &DAG = DCI.DAG;
    3314        4210 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    3315        4210 :   SDValue CmpLHS = Cond.getOperand(0);
    3316             : 
    3320             :   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
    3321             :   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_b32 x
    3322        1989 :   if (CCOpcode == ISD::SETEQ &&
    3323        1987 :       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
    3324        4212 :       RHS.getOperand(0) == CmpLHS &&
    3325           2 :       isNegativeOne(LHS)) {
                     :     // The count operation is the false value here, so it selects the opcode.
                     :     unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
                     :                                              AMDGPUISD::FFBH_U32;
    3326           2 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3327             :   }
    3328             : 
    3329             :   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
    3330             :   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_b32 x
    3331         854 :   if (CCOpcode == ISD::SETNE &&
    3332         831 :       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
    3333        4231 :       LHS.getOperand(0) == CmpLHS &&
    3334          23 :       isNegativeOne(RHS)) {
                     :     // The count operation is the true value here. Both the match above and
                     :     // the opcode choice must look at LHS: RHS is the -1 constant, so testing
                     :     // it for cttz could never succeed and would always pick FFBH_U32.
                     :     unsigned Opc = isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
                     :                                              AMDGPUISD::FFBH_U32;
    3335          19 :     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
    3336             :   }
    3337             : 
    3338        4189 :   return SDValue();
    3339             : }
    3340             : 
    //               : // Rewrites  select Cond, (Op x), (Op y)  as  Op (select Cond, x, y).
    //               : //
    //               : // N1 and N2 are the select's two value operands; only their operand 0 is
    //               : // used, and the result type is taken from N1. The new select is queued on the
    //               : // combiner worklist before being wrapped in a single Op node.
    3341          24 : static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
    3342             :                                          unsigned Op,
    3343             :                                          const SDLoc &SL,
    3344             :                                          SDValue Cond,
    3345             :                                          SDValue N1,
    3346             :                                          SDValue N2) {
    3347          24 :   SelectionDAG &DAG = DCI.DAG;
    3348          48 :   EVT VT = N1.getValueType();
    3349             : 
    3350             :   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
    3351          48 :                                   N1.getOperand(0), N2.getOperand(0));
    3352          24 :   DCI.AddToWorklist(NewSelect.getNode());
    3353          24 :   return DAG.getNode(Op, SL, VT, NewSelect);
    3354             : }
    3355             : 
    3356             : // Pull a free FP operation out of a select so it may fold into uses.
    3357             : //
    3358             : // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
    3359             : // select c, (fneg x), k -> fneg (select c, x, (fneg k))
    3360             : //
    3361             : // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
    3362             : // select c, (fabs x), +k -> fabs (select c, x, k)
    //               : //
    //               : // N is the select value (operands: condition, true value, false value). The
    //               : // constant forms are matched with the fneg/fabs on either side of the select.
    //               : // Returns the replacement value, or an empty SDValue when nothing matches,
    //               : // when an fabs is paired with a negative constant, or when the inner
    //               : // operation is judged better left alone (see ShouldFoldNeg below).
    3363       10070 : static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
    3364             :                                     SDValue N) {
    3365       10070 :   SelectionDAG &DAG = DCI.DAG;
    3366       10070 :   SDValue Cond = N.getOperand(0);
    3367       10070 :   SDValue LHS = N.getOperand(1);
    3368       10070 :   SDValue RHS = N.getOperand(2);
    3369             : 
    3370       10070 :   EVT VT = N.getValueType();
    3371       10118 :   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
    3372         136 :       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    3373             :     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
    3374          48 :                                      SDLoc(N), Cond, LHS, RHS);
    3375             :   }
    3376             : 
    3377             :   bool Inv = false; // true when the fneg/fabs was found on the false side
    3378       20066 :   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    3379             :     std::swap(LHS, RHS);
    3380             :     Inv = true;
    3381             :   }
    3382             : 
    3383             :   // TODO: Support vector constants.
    3384             :   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    3385       10046 :   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
    3386             :     SDLoc SL(N);
    3387             :     // If one side is an fneg/fabs and the other is a constant, we can push the
    3388             :     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    3389          78 :     SDValue NewLHS = LHS.getOperand(0);
    3390          78 :     SDValue NewRHS = RHS;
    3391             : 
    3392             :     // Careful: if the neg can be folded up, don't try to pull it back down.
    3393             :     bool ShouldFoldNeg = true;
    3394             : 
    3395          78 :     if (NewLHS.hasOneUse()) {
    3396             :       unsigned Opc = NewLHS.getOpcode();
    3397          68 :       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
    3398             :         ShouldFoldNeg = false;
    3399          68 :       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
    3400             :         ShouldFoldNeg = false;
    3401             :     }
    3402             : 
    3403          68 :     if (ShouldFoldNeg) {
    3404          68 :       if (LHS.getOpcode() == ISD::FNEG)
    3405          34 :         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3406          68 :       else if (CRHS->isNegative())
    3407          26 :         return SDValue();
    3408             : 
    3409          42 :       if (Inv) // restore the original true/false order of the select
    3410             :         std::swap(NewLHS, NewRHS);
    3411             : 
    3412             :       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
    3413          42 :                                       Cond, NewLHS, NewRHS);
    3414          42 :       DCI.AddToWorklist(NewSelect.getNode());
    3415          42 :       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    3416             :     }
    3417             :   }
    3418             : 
    3419        9978 :   return SDValue();
    3420             : }
    3421             : 
    3422             : 
    //               : // DAG combine for a select node (operands: condition, true value, false
    //               : // value). The steps are tried in this order:
    //               : //   1. foldFreeOpFromSelect; a non-empty result is returned directly.
    //               : //   2. Nothing further is attempted unless the condition is a SETCC.
    //               : //   3. If the condition has a single use and the true value is a constant
    //               : //      while the false value is not, the compare is inverted and the select
    //               : //      operands are swapped so the constant ends up as the false value.
    //               : //   4. If the condition has a single use, the type is f32 and the subtarget
    //               : //      has legacy fmin/fmax, the result of combineFMinMaxLegacy is returned.
    //               : //   5. Otherwise performCtlz_CttzCombine is tried, whatever the number of
    //               : //      uses of the condition.
    3423       10070 : SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
    3424             :                                                    DAGCombinerInfo &DCI) const {
    3425       10070 :   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    3426          66 :     return Folded;
    3427             : 
    3428       10004 :   SDValue Cond = N->getOperand(0);
    3429       10004 :   if (Cond.getOpcode() != ISD::SETCC)
    3430         886 :     return SDValue();
    3431             : 
    3432       18236 :   EVT VT = N->getValueType(0);
    3433        9118 :   SDValue LHS = Cond.getOperand(0);
    3434        9118 :   SDValue RHS = Cond.getOperand(1);
    3435        9118 :   SDValue CC = Cond.getOperand(2);
    3436             : 
    3437        9118 :   SDValue True = N->getOperand(1);
    3438        9118 :   SDValue False = N->getOperand(2);
    3439             : 
    3440        9118 :   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    3441        4132 :     SelectionDAG &DAG = DCI.DAG;
    3442        6713 :     if (DAG.isConstantValueOfAnyType(True) &&
    3444        1804 :         !DAG.isConstantValueOfAnyType(False)) {
    3446             :       // Swap cmp + select pair to move constant to false input.
    3447             :       // This will allow using VOPC cndmasks more often.
    3448             :       // select (setcc x, y), k, z -> select (setcc-inverse x, y), z, k
    3449             : 
    3450             :       SDLoc SL(N);
    3451         253 :       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
    3452         506 :                                             LHS.getValueType().isInteger());
    3453             : 
    3454         253 :       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
    3455         253 :       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    3456             :     }
    3457             : 
    3458        1318 :     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
    3459             :       SDValue MinMax
    3460         724 :         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
    3461             :       // Revisit this node so we can catch min3/max3/med3 patterns.
    3462             :       //DCI.AddToWorklist(MinMax.getNode());
    3463         362 :       return MinMax;
    3464             :     }
    3465             :   }
    3466             : 
    3467             :   // There's no reason to not do this if the condition has other uses.
    3468       17006 :   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
    3469             : }
    3470             : 
    //               : // Returns true when N is an FP constant (or a splat of one, as recognised by
    //               : // isConstOrConstSplatFP) that is zero and not negative, i.e. +0.0.
    3471          63 : static bool isConstantFPZero(SDValue N) {
    3472          63 :   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    3473         113 :     return C->isZero() && !C->isNegative();
    3474             :   return false;
    3475             : }
    3476             : 
    //               : // Maps a min/max opcode to its counterpart: FMAXNUM <-> FMINNUM and
    //               : // FMAX_LEGACY <-> FMIN_LEGACY. Any other opcode hits llvm_unreachable.
    3477             : static unsigned inverseMinMax(unsigned Opc) {
    3478          46 :   switch (Opc) {
    3479             :   case ISD::FMAXNUM:
    3480             :     return ISD::FMINNUM;
    3481          21 :   case ISD::FMINNUM:
    3482             :     return ISD::FMAXNUM;
    3483           2 :   case AMDGPUISD::FMAX_LEGACY:
    3484             :     return AMDGPUISD::FMIN_LEGACY;
    3485           3 :   case AMDGPUISD::FMIN_LEGACY:
    3486             :     return  AMDGPUISD::FMAX_LEGACY;
    3487           0 :   default:
    3488           0 :     llvm_unreachable("invalid min/max opcode");
    3489             :   }
    3490             : }
    3491             : 
    //               : // DAG combine for an fneg node: tries to push the negation into the operation
    //               : // that produces its operand (N0).
    //               : //
    //               : // Producers handled by the switch below: fadd, fmul / fmul_legacy, fma / fmad,
    //               : // fmaxnum / fminnum / fmax_legacy / fmin_legacy, the one-operand group
    //               : // (fp_extend, ftrunc, frint, fnearbyint, fsin, rcp, rcp_legacy, sin_hw),
    //               : // fp_round and fp16_to_fp. Any other opcode returns an empty SDValue, as do
    //               : // the early profitability checks on the uses of N and N0.
    //               : //
    //               : // In the fadd, fmul, fma and min/max cases, when N0 has users other than this
    //               : // fneg, those users are redirected to (fneg Res), where Res is the newly built
    //               : // node that is returned as the replacement for N.
    3492        3162 : SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
    3493             :                                                  DAGCombinerInfo &DCI) const {
    3494        3162 :   SelectionDAG &DAG = DCI.DAG;
    3495        3162 :   SDValue N0 = N->getOperand(0);
    3496        6324 :   EVT VT = N->getValueType(0);
    3497             : 
    3498             :   unsigned Opc = N0.getOpcode();
    3499             : 
    3500             :   // If the input has multiple uses and we can either fold the negate down, or
    3501             :   // the other uses cannot, give up. This both prevents unprofitable
    3502             :   // transformations and infinite loops: we won't repeatedly try to fold around
    3503             :   // a negate that has no 'good' form.
    3504        3162 :   if (N0.hasOneUse()) {
    3505             :     // This may be able to fold into the source, but at a code size cost. Don't
    3506             :     // fold if the fold into the user is free.
    3507        2428 :     if (allUsesHaveSourceMods(N, 0))
    3508         876 :       return SDValue();
    3509             :   } else {
    3510         940 :     if (fnegFoldsIntoOp(Opc) &&
    3511         249 :         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
    3512         185 :       return SDValue();
    3513             :   }
    3514             : 
    3515             :   SDLoc SL(N);
    3516        2101 :   switch (Opc) {
    3517          92 :   case ISD::FADD: {
    3518          77 :     if (!mayIgnoreSignedZero(N0))
    3519          78 :       return SDValue();
    3520             : 
    3521             :     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    3522          14 :     SDValue LHS = N0.getOperand(0);
    3523          14 :     SDValue RHS = N0.getOperand(1);
    3524             : 
    3525          14 :     if (LHS.getOpcode() != ISD::FNEG)
    3526          10 :       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3527             :     else
    3528           4 :       LHS = LHS.getOperand(0); // an existing fneg cancels instead of stacking
    3529             : 
    3530          14 :     if (RHS.getOpcode() != ISD::FNEG)
    3531          12 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3532             :     else
    3533           2 :       RHS = RHS.getOperand(0);
    3534             : 
    3535          14 :     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    3536          14 :     if (!N0.hasOneUse())
    3537           2 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3538          14 :     return Res;
    3539             :   }
    3540          61 :   case ISD::FMUL:
    3541             :   case AMDGPUISD::FMUL_LEGACY: {
    3542             :     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    3543             :     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    3544          61 :     SDValue LHS = N0.getOperand(0);
    3545          61 :     SDValue RHS = N0.getOperand(1);
    3546             : 
    3547          61 :     if (LHS.getOpcode() == ISD::FNEG) // only one factor needs its sign flipped
    3548           8 :       LHS = LHS.getOperand(0);
    3549          53 :     else if (RHS.getOpcode() == ISD::FNEG)
    3550           2 :       RHS = RHS.getOperand(0);
    3551             :     else
    3552          51 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3553             : 
    3554          61 :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    3555          61 :     if (!N0.hasOneUse())
    3556           5 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3557          61 :     return Res;
    3558             :   }
    3559          47 :   case ISD::FMA:
    3560             :   case ISD::FMAD: {
    3561          31 :     if (!mayIgnoreSignedZero(N0))
    3562          35 :       return SDValue();
    3563             : 
    3564             :     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    3565          12 :     SDValue LHS = N0.getOperand(0);
    3566          12 :     SDValue MHS = N0.getOperand(1);
    3567          12 :     SDValue RHS = N0.getOperand(2);
    3568             : 
    3569          12 :     if (LHS.getOpcode() == ISD::FNEG)
    3570           5 :       LHS = LHS.getOperand(0);
    3571           7 :     else if (MHS.getOpcode() == ISD::FNEG)
    3572           1 :       MHS = MHS.getOperand(0);
    3573             :     else
    3574           6 :       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
    3575             : 
    3576          12 :     if (RHS.getOpcode() != ISD::FNEG)
    3577          10 :       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3578             :     else
    3579           2 :       RHS = RHS.getOperand(0);
    3580             : 
    3581          12 :     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); // NOTE(review): N0's flags are not forwarded here, unlike the fadd/fmul cases - confirm
    3582          12 :     if (!N0.hasOneUse())
    3583           1 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3584          12 :     return Res;
    3585             :   }
    3586          63 :   case ISD::FMAXNUM:
    3587             :   case ISD::FMINNUM:
    3588             :   case AMDGPUISD::FMAX_LEGACY:
    3589             :   case AMDGPUISD::FMIN_LEGACY: {
    3590             :     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    3591             :     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    3592             :     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    3593             :     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
    3594             : 
    3595          63 :     SDValue LHS = N0.getOperand(0);
    3596          63 :     SDValue RHS = N0.getOperand(1);
    3597             : 
    3598             :     // 0 doesn't have a negated inline immediate.
    3599             :     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
    3600             :     // operations.
    3601          63 :     if (isConstantFPZero(RHS))
    3602          17 :       return SDValue();
    3603             : 
    3604          46 :     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    3605          46 :     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    3606             :     unsigned Opposite = inverseMinMax(Opc);
    3607             : 
    3608          46 :     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    3609          46 :     if (!N0.hasOneUse())
    3610           4 :       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    3611          46 :     return Res;
    3612             :   }
    3613          57 :   case ISD::FP_EXTEND:
    3614             :   case ISD::FTRUNC:
    3615             :   case ISD::FRINT:
    3616             :   case ISD::FNEARBYINT: // XXX - Should fround be handled?
    3617             :   case ISD::FSIN:
    3618             :   case AMDGPUISD::RCP:
    3619             :   case AMDGPUISD::RCP_LEGACY:
    3620             :   case AMDGPUISD::SIN_HW: {
    3621          57 :     SDValue CvtSrc = N0.getOperand(0);
    3622          57 :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3623             :       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
    3624             :       // (fneg (rcp (fneg x))) -> (rcp x)
    3625           8 :       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    3626             :     }
    3627             : 
    3628          49 :     if (!N0.hasOneUse())
    3629          14 :       return SDValue();
    3630             : 
    3631             :     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    3632             :     // (fneg (rcp x)) -> (rcp (fneg x))
    3633          35 :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3634          35 :     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
    3635             :   }
    3636           9 :   case ISD::FP_ROUND: {
    3637           9 :     SDValue CvtSrc = N0.getOperand(0);
    3638             : 
    3639           9 :     if (CvtSrc.getOpcode() == ISD::FNEG) {
    3640             :       // (fneg (fp_round (fneg x))) -> (fp_round x)
    3641             :       return DAG.getNode(ISD::FP_ROUND, SL, VT,
    3642           0 :                          CvtSrc.getOperand(0), N0.getOperand(1));
    3643             :     }
    3644             : 
    3645           9 :     if (!N0.hasOneUse())
    3646           4 :       return SDValue();
    3647             : 
    3648             :     // (fneg (fp_round x)) -> (fp_round (fneg x))
    3649           5 :     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    3650           5 :     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); // operand 1 of the original fp_round is passed through unchanged
    3651             :   }
    3652             :   case ISD::FP16_TO_FP: {
    3653             :     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    3654             :     // f16, but legalization of f16 fneg ends up pulling it out of the source.
    3655             :     // Put the fneg back as a legal source operation that can be matched later.
    3656             :     SDLoc SL(N); // NOTE(review): shadows the SL declared before the switch; both are built from N
    3657             : 
    3658          23 :     SDValue Src = N0.getOperand(0);
    3659          23 :     EVT SrcVT = Src.getValueType();
    3660             : 
    3661             :     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    3662             :     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
    3663          23 :                                   DAG.getConstant(0x8000, SL, SrcVT));
    3664          46 :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
    3665             :   }
    3666        1749 :   default:
    3667        1749 :     return SDValue();
    3668             :   }
    3669             : }
    3670             : 
    //               : // DAG combine for an fabs node whose operand has a single use.
    //               : //
    //               : // The only producer handled is fp16_to_fp: the fabs is replaced by an integer
    //               : // AND with 0x7fff on the fp16_to_fp source, followed by a new fp16_to_fp to
    //               : // the fabs result type. Every other case returns an empty SDValue. The
    //               : // assert documents that this form is only expected on subtargets without
    //               : // 16-bit instructions.
    3671        1876 : SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
    3672             :                                                  DAGCombinerInfo &DCI) const {
    3673        1876 :   SelectionDAG &DAG = DCI.DAG;
    3674        1876 :   SDValue N0 = N->getOperand(0);
    3675             : 
    3676        1876 :   if (!N0.hasOneUse())
    3677         284 :     return SDValue();
    3678             : 
    3679        1592 :   switch (N0.getOpcode()) {
    3680             :   case ISD::FP16_TO_FP: {
    3681             :     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    3682             :     SDLoc SL(N);
    3683          20 :     SDValue Src = N0.getOperand(0);
    3684          20 :     EVT SrcVT = Src.getValueType();
    3685             : 
    3686             :     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    3687             :     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
    3688          20 :                                   DAG.getConstant(0x7fff, SL, SrcVT));
    3689          40 :     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
    3690             :   }
    3691        1572 :   default:
    3692        1572 :     return SDValue();
    3693             :   }
    3694             : }
    3695             : 
    3696      700028 : SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    3697             :                                                 DAGCombinerInfo &DCI) const {
    3698      700028 :   SelectionDAG &DAG = DCI.DAG;
    3699             :   SDLoc DL(N);
    3700             : 
    3701     1400056 :   switch(N->getOpcode()) {
    3702             :   default:
    3703             :     break;
    3704       98628 :   case ISD::BITCAST: {
    3705      197256 :     EVT DestVT = N->getValueType(0);
    3706             : 
    3707             :     // Push casts through vector builds. This helps avoid emitting a large
    3708             :     // number of copies when materializing floating point vector constants.
    3709             :     //
    3710             :     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    3711             :     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    3712       98628 :     if (DestVT.isVector()) {
    3713       30627 :       SDValue Src = N->getOperand(0);
    3714       30627 :       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
    3715        4263 :         EVT SrcVT = Src.getValueType();
    3716        4263 :         unsigned NElts = DestVT.getVectorNumElements();
    3717             : 
    3718        4263 :         if (SrcVT.getVectorNumElements() == NElts) {
    3719        2254 :           EVT DestEltVT = DestVT.getVectorElementType();
    3720             : 
    3721             :           SmallVector<SDValue, 8> CastedElts;
    3722             :           SDLoc SL(N);
    3723       13862 :           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
    3724       11608 :             SDValue Elt = Src.getOperand(I);
    3725       11608 :             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
    3726             :           }
    3727             : 
    3728        2254 :           return DAG.getBuildVector(DestVT, SL, CastedElts);
    3729             :         }
    3730             :       }
    3731             :     }
    3732             : 
    3733      140600 :     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
    3734             :       break;
    3735             : 
    3736             :     // Fold bitcasts of constants.
    3737             :     //
    3738             :     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    3739             :     // TODO: Generalize and move to DAGCombiner
    3740       64052 :     SDValue Src = N->getOperand(0);
    3741             :     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
    3742             :       if (Src.getValueType() == MVT::i64) {
    3743             :         SDLoc SL(N);
    3744         350 :         uint64_t CVal = C->getZExtValue();
    3745             :         return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
    3746             :                            DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3747         700 :                            DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3748             :       }
    3749             :     }
    3750             : 
    3751             :     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
    3752          38 :       const APInt &Val = C->getValueAPF().bitcastToAPInt();
    3753             :       SDLoc SL(N);
    3754             :       uint64_t CVal = Val.getZExtValue();
    3755             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    3756             :                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
    3757          57 :                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    3758             : 
    3759          19 :       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    3760             :     }
    3761             : 
    3762             :     break;
    3763             :   }
    3764       30257 :   case ISD::SHL: {
    3765       30257 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3766             :       break;
    3767             : 
    3768       18678 :     return performShlCombine(N, DCI);
    3769             :   }
    3770       91640 :   case ISD::SRL: {
    3771       91640 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3772             :       break;
    3773             : 
    3774       60793 :     return performSrlCombine(N, DCI);
    3775             :   }
    3776       11701 :   case ISD::SRA: {
    3777       11701 :     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    3778             :       break;
    3779             : 
    3780        5869 :     return performSraCombine(N, DCI);
    3781             :   }
    3782       52595 :   case ISD::TRUNCATE:
    3783       52595 :     return performTruncateCombine(N, DCI);
    3784        5124 :   case ISD::MUL:
    3785        5124 :     return performMulCombine(N, DCI);
    3786         100 :   case ISD::MULHS:
    3787         100 :     return performMulhsCombine(N, DCI);
    3788        3330 :   case ISD::MULHU:
    3789        3330 :     return performMulhuCombine(N, DCI);
    3790        1504 :   case AMDGPUISD::MUL_I24:
    3791             :   case AMDGPUISD::MUL_U24:
    3792             :   case AMDGPUISD::MULHI_I24:
    3793             :   case AMDGPUISD::MULHI_U24: {
    3794             :     // If the first call to simplify is successful, then N may end up being
    3795             :     // deleted, so we shouldn't call simplifyI24 again.
    3796        1504 :     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
    3797        1504 :     return SDValue();
    3798             :   }
    3799         122 :   case AMDGPUISD::MUL_LOHI_I24:
    3800             :   case AMDGPUISD::MUL_LOHI_U24:
    3801         122 :     return performMulLoHi24Combine(N, DCI);
    3802       10070 :   case ISD::SELECT:
    3803       10070 :     return performSelectCombine(N, DCI);
    3804        3162 :   case ISD::FNEG:
    3805        3162 :     return performFNegCombine(N, DCI);
    3806        1876 :   case ISD::FABS:
    3807        1876 :     return performFAbsCombine(N, DCI);
    3808         399 :   case AMDGPUISD::BFE_I32:
    3809             :   case AMDGPUISD::BFE_U32: {
    3810             :     assert(!N->getValueType(0).isVector() &&
    3811             :            "Vector handling of BFE not implemented");
    3812         399 :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    3813             :     if (!Width)
    3814             :       break;
    3815             : 
    3816         774 :     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    3817         387 :     if (WidthVal == 0)
    3818          22 :       return DAG.getConstant(0, DL, MVT::i32);
    3819             : 
    3820             :     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    3821             :     if (!Offset)
    3822             :       break;
    3823             : 
    3824         361 :     SDValue BitsFrom = N->getOperand(0);
    3825         722 :     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
    3826             : 
    3827             :     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
    3828             : 
    3829         361 :     if (OffsetVal == 0) {
    3830             :       // This is already sign / zero extended, so try to fold away extra BFEs.
    3831          58 :       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
    3832             : 
    3833          58 :       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
    3834          58 :       if (OpSignBits >= SignBits)
    3835          34 :         return BitsFrom;
    3836             : 
    3837          24 :       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
    3838          24 :       if (Signed) {
    3839             :         // This is a sign_extend_inreg. Replace it to take advantage of existing
    3840             :         // DAG Combines. If not eliminated, we will match back to BFE during
    3841             :         // selection.
    3842             : 
    3843             :         // TODO: The sext_inreg of extended types ends, although we could
    3844             :         // handle them in a single BFE.
    3845             :         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
    3846          36 :                            DAG.getValueType(SmallVT));
    3847             :       }
    3848             : 
    3849           6 :       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    3850             :     }
    3851             : 
    3852             :     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
    3853          48 :       if (Signed) {
    3854             :         return constantFoldBFE<int32_t>(DAG,
    3855          24 :                                         CVal->getSExtValue(),
    3856             :                                         OffsetVal,
    3857             :                                         WidthVal,
    3858          24 :                                         DL);
    3859             :       }
    3860             : 
    3861             :       return constantFoldBFE<uint32_t>(DAG,
    3862          24 :                                        CVal->getZExtValue(),
    3863             :                                        OffsetVal,
    3864             :                                        WidthVal,
    3865          24 :                                        DL);
    3866             :     }
    3867             : 
    3868         322 :     if ((OffsetVal + WidthVal) >= 32 &&
    3869         117 :         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
    3870          65 :       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
    3871             :       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
    3872          65 :                          BitsFrom, ShiftVal);
    3873             :     }
    3874             : 
    3875         190 :     if (BitsFrom.hasOneUse()) {
    3876             :       APInt Demanded = APInt::getBitsSet(32,
    3877             :                                          OffsetVal,
    3878             :                                          OffsetVal + WidthVal);
    3879             : 
    3880          48 :       KnownBits Known;
    3881          48 :       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    3882          96 :                                             !DCI.isBeforeLegalizeOps());
    3883          48 :       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    3884          88 :       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
    3885          40 :           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
    3886          14 :         DCI.CommitTargetLoweringOpt(TLO);
    3887             :       }
    3888             :     }
    3889             : 
    3890             :     break;
    3891             :   }
    3892      144086 :   case ISD::LOAD:
    3893      144086 :     return performLoadCombine(N, DCI);
    3894       93951 :   case ISD::STORE:
    3895       93951 :     return performStoreCombine(N, DCI);
    3896         689 :   case AMDGPUISD::RCP: {
    3897         689 :     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
    3898             :       // XXX - Should this flush denormals?
    3899           2 :       const APFloat &Val = CFP->getValueAPF();
    3900           2 :       APFloat One(Val.getSemantics(), "1.0");
    3901           8 :       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
    3902             :     }
    3903             : 
    3904             :     break;
    3905             :   }
    3906       11025 :   case ISD::AssertZext:
    3907             :   case ISD::AssertSext:
    3908       11025 :     return performAssertSZExtCombine(N, DCI);
    3909             :   }
    3910      284925 :   return SDValue();
    3911             : }
    3912             : 
    3913             : //===----------------------------------------------------------------------===//
    3914             : // Helper functions
    3915             : //===----------------------------------------------------------------------===//
    3916             : 
    3917        5187 : SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
    3918             :                                                    const TargetRegisterClass *RC,
    3919             :                                                    unsigned Reg, EVT VT,
    3920             :                                                    const SDLoc &SL,
    3921             :                                                    bool RawReg) const {
    3922        5187 :   MachineFunction &MF = DAG.getMachineFunction();
    3923        5187 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3924             :   unsigned VReg;
    3925             : 
    3926        5187 :   if (!MRI.isLiveIn(Reg)) {
    3927        1652 :     VReg = MRI.createVirtualRegister(RC);
    3928             :     MRI.addLiveIn(Reg, VReg);
    3929             :   } else {
    3930        3535 :     VReg = MRI.getLiveInVirtReg(Reg);
    3931             :   }
    3932             : 
    3933        5187 :   if (RawReg)
    3934         257 :     return DAG.getRegister(VReg, VT);
    3935             : 
    3936        4930 :   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
    3937             : }
    3938             : 
    3939           8 : SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
    3940             :                                                   EVT VT,
    3941             :                                                   const SDLoc &SL,
    3942             :                                                   int64_t Offset) const {
    3943           8 :   MachineFunction &MF = DAG.getMachineFunction();
    3944           8 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    3945             : 
    3946           8 :   int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
    3947           8 :   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
    3948           8 :   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
    3949             : 
    3950             :   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
    3951             :                      MachineMemOperand::MODereferenceable |
    3952           8 :                      MachineMemOperand::MOInvariant);
    3953             : }
    3954             : 
    3955          10 : SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
    3956             :                                                    const SDLoc &SL,
    3957             :                                                    SDValue Chain,
    3958             :                                                    SDValue StackPtr,
    3959             :                                                    SDValue ArgVal,
    3960             :                                                    int64_t Offset) const {
    3961          10 :   MachineFunction &MF = DAG.getMachineFunction();
    3962          10 :   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
    3963             : 
    3964          10 :   SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
    3965             :   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
    3966          10 :                                MachineMemOperand::MODereferenceable);
    3967          10 :   return Store;
    3968             : }
    3969             : 
    3970        3325 : SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
    3971             :                                              const TargetRegisterClass *RC,
    3972             :                                              EVT VT, const SDLoc &SL,
    3973             :                                              const ArgDescriptor &Arg) const {
    3974             :   assert(Arg && "Attempting to load missing argument");
    3975             : 
    3976        3325 :   if (Arg.isRegister())
    3977        3317 :     return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
    3978           8 :   return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
    3979             : }
    3980             : 
    3981          42 : uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    3982             :     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
    3983          42 :   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
    3984          42 :   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
    3985          42 :   switch (Param) {
    3986          42 :   case GRID_DIM:
    3987          42 :     return ArgOffset;
    3988           0 :   case GRID_OFFSET:
    3989           0 :     return ArgOffset + 4;
    3990             :   }
    3991           0 :   llvm_unreachable("unexpected implicit parameter type");
    3992             : }
    3993             : 
    3994             : #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
    3995             : 
    3996           0 : const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    3997           0 :   switch ((AMDGPUISD::NodeType)Opcode) {
    3998             :   case AMDGPUISD::FIRST_NUMBER: break;
    3999             :   // AMDIL DAG nodes
    4000             :   NODE_NAME_CASE(UMUL);
    4001           0 :   NODE_NAME_CASE(BRANCH_COND);
    4002             : 
    4003             :   // AMDGPU DAG nodes
    4004           0 :   NODE_NAME_CASE(IF)
    4005           0 :   NODE_NAME_CASE(ELSE)
    4006           0 :   NODE_NAME_CASE(LOOP)
    4007           0 :   NODE_NAME_CASE(CALL)
    4008           0 :   NODE_NAME_CASE(TC_RETURN)
    4009           0 :   NODE_NAME_CASE(TRAP)
    4010           0 :   NODE_NAME_CASE(RET_FLAG)
    4011           0 :   NODE_NAME_CASE(RETURN_TO_EPILOG)
    4012           0 :   NODE_NAME_CASE(ENDPGM)
    4013           0 :   NODE_NAME_CASE(DWORDADDR)
    4014           0 :   NODE_NAME_CASE(FRACT)
    4015           0 :   NODE_NAME_CASE(SETCC)
    4016           0 :   NODE_NAME_CASE(SETREG)
    4017           0 :   NODE_NAME_CASE(FMA_W_CHAIN)
    4018           0 :   NODE_NAME_CASE(FMUL_W_CHAIN)
    4019           0 :   NODE_NAME_CASE(CLAMP)
    4020           0 :   NODE_NAME_CASE(COS_HW)
    4021           0 :   NODE_NAME_CASE(SIN_HW)
    4022           0 :   NODE_NAME_CASE(FMAX_LEGACY)
    4023           0 :   NODE_NAME_CASE(FMIN_LEGACY)
    4024           0 :   NODE_NAME_CASE(FMAX3)
    4025           0 :   NODE_NAME_CASE(SMAX3)
    4026           0 :   NODE_NAME_CASE(UMAX3)
    4027           0 :   NODE_NAME_CASE(FMIN3)
    4028           0 :   NODE_NAME_CASE(SMIN3)
    4029           0 :   NODE_NAME_CASE(UMIN3)
    4030           0 :   NODE_NAME_CASE(FMED3)
    4031           0 :   NODE_NAME_CASE(SMED3)
    4032           0 :   NODE_NAME_CASE(UMED3)
    4033           0 :   NODE_NAME_CASE(URECIP)
    4034           0 :   NODE_NAME_CASE(DIV_SCALE)
    4035           0 :   NODE_NAME_CASE(DIV_FMAS)
    4036           0 :   NODE_NAME_CASE(DIV_FIXUP)
    4037           0 :   NODE_NAME_CASE(FMAD_FTZ)
    4038           0 :   NODE_NAME_CASE(TRIG_PREOP)
    4039           0 :   NODE_NAME_CASE(RCP)
    4040           0 :   NODE_NAME_CASE(RSQ)
    4041           0 :   NODE_NAME_CASE(RCP_LEGACY)
    4042           0 :   NODE_NAME_CASE(RSQ_LEGACY)
    4043           0 :   NODE_NAME_CASE(FMUL_LEGACY)
    4044           0 :   NODE_NAME_CASE(RSQ_CLAMP)
    4045           0 :   NODE_NAME_CASE(LDEXP)
    4046           0 :   NODE_NAME_CASE(FP_CLASS)
    4047           0 :   NODE_NAME_CASE(DOT4)
    4048           0 :   NODE_NAME_CASE(CARRY)
    4049           0 :   NODE_NAME_CASE(BORROW)
    4050           0 :   NODE_NAME_CASE(BFE_U32)
    4051           0 :   NODE_NAME_CASE(BFE_I32)
    4052           0 :   NODE_NAME_CASE(BFI)
    4053           0 :   NODE_NAME_CASE(BFM)
    4054           0 :   NODE_NAME_CASE(FFBH_U32)
    4055           0 :   NODE_NAME_CASE(FFBH_I32)
    4056           0 :   NODE_NAME_CASE(FFBL_B32)
    4057           0 :   NODE_NAME_CASE(MUL_U24)
    4058           0 :   NODE_NAME_CASE(MUL_I24)
    4059           0 :   NODE_NAME_CASE(MULHI_U24)
    4060           0 :   NODE_NAME_CASE(MULHI_I24)
    4061           0 :   NODE_NAME_CASE(MUL_LOHI_U24)
    4062           0 :   NODE_NAME_CASE(MUL_LOHI_I24)
    4063           0 :   NODE_NAME_CASE(MAD_U24)
    4064           0 :   NODE_NAME_CASE(MAD_I24)
    4065           0 :   NODE_NAME_CASE(MAD_I64_I32)
    4066           0 :   NODE_NAME_CASE(MAD_U64_U32)
    4067           0 :   NODE_NAME_CASE(PERM)
    4068           0 :   NODE_NAME_CASE(TEXTURE_FETCH)
    4069           0 :   NODE_NAME_CASE(EXPORT)
    4070           0 :   NODE_NAME_CASE(EXPORT_DONE)
    4071           0 :   NODE_NAME_CASE(R600_EXPORT)
    4072           0 :   NODE_NAME_CASE(CONST_ADDRESS)
    4073           0 :   NODE_NAME_CASE(REGISTER_LOAD)
    4074           0 :   NODE_NAME_CASE(REGISTER_STORE)
    4075           0 :   NODE_NAME_CASE(SAMPLE)
    4076           0 :   NODE_NAME_CASE(SAMPLEB)
    4077           0 :   NODE_NAME_CASE(SAMPLED)
    4078           0 :   NODE_NAME_CASE(SAMPLEL)
    4079           0 :   NODE_NAME_CASE(CVT_F32_UBYTE0)
    4080           0 :   NODE_NAME_CASE(CVT_F32_UBYTE1)
    4081           0 :   NODE_NAME_CASE(CVT_F32_UBYTE2)
    4082           0 :   NODE_NAME_CASE(CVT_F32_UBYTE3)
    4083           0 :   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
    4084           0 :   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
    4085           0 :   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
    4086           0 :   NODE_NAME_CASE(CVT_PK_I16_I32)
    4087           0 :   NODE_NAME_CASE(CVT_PK_U16_U32)
    4088           0 :   NODE_NAME_CASE(FP_TO_FP16)
    4089           0 :   NODE_NAME_CASE(FP16_ZEXT)
    4090           0 :   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
    4091           0 :   NODE_NAME_CASE(CONST_DATA_PTR)
    4092           0 :   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
    4093           0 :   NODE_NAME_CASE(KILL)
    4094           0 :   NODE_NAME_CASE(DUMMY_CHAIN)
    4095             :   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
    4096           0 :   NODE_NAME_CASE(INIT_EXEC)
    4097           0 :   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
    4098           0 :   NODE_NAME_CASE(SENDMSG)
    4099           0 :   NODE_NAME_CASE(SENDMSGHALT)
    4100           0 :   NODE_NAME_CASE(INTERP_MOV)
    4101           0 :   NODE_NAME_CASE(INTERP_P1)
    4102           0 :   NODE_NAME_CASE(INTERP_P2)
    4103           0 :   NODE_NAME_CASE(STORE_MSKOR)
    4104           0 :   NODE_NAME_CASE(LOAD_CONSTANT)
    4105           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
    4106           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
    4107           0 :   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
    4108           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
    4109           0 :   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
    4110           0 :   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
    4111           0 :   NODE_NAME_CASE(ATOMIC_INC)
    4112           0 :   NODE_NAME_CASE(ATOMIC_DEC)
    4113           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FADD)
    4114           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
    4115           0 :   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
    4116           0 :   NODE_NAME_CASE(BUFFER_LOAD)
    4117           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
    4118           0 :   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
    4119           0 :   NODE_NAME_CASE(BUFFER_STORE)
    4120           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
    4121           0 :   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
    4122           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
    4123           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
    4124           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
    4125           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
    4126           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
    4127           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
    4128           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
    4129           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
    4130           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
    4131           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
    4132           0 :   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
    4133           0 :   NODE_NAME_CASE(IMAGE_LOAD)
    4134           0 :   NODE_NAME_CASE(IMAGE_LOAD_MIP)
    4135           0 :   NODE_NAME_CASE(IMAGE_STORE)
    4136           0 :   NODE_NAME_CASE(IMAGE_STORE_MIP)
    4137             :   // Basic sample.
    4138           0 :   NODE_NAME_CASE(IMAGE_SAMPLE)
    4139           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CL)
    4140           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D)
    4141           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_CL)
    4142           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_L)
    4143           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B)
    4144           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_CL)
    4145           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_LZ)
    4146           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD)
    4147           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL)
    4148             :   // Sample with comparison.
    4149           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C)
    4150           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CL)
    4151           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D)
    4152           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL)
    4153           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_L)
    4154           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B)
    4155           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL)
    4156           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ)
    4157           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD)
    4158           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL)
    4159             :   // Sample with offsets.
    4160           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_O)
    4161           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CL_O)
    4162           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_O)
    4163           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_D_CL_O)
    4164           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_L_O)
    4165           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_O)
    4166           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_B_CL_O)
    4167           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_LZ_O)
    4168           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_O)
    4169           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL_O)
    4170             :   // Sample with comparison and offsets.
    4171           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_O)
    4172           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CL_O)
    4173           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_O)
    4174           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL_O)
    4175           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_L_O)
    4176           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_O)
    4177           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL_O)
    4178           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ_O)
    4179           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_O)
    4180           0 :   NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL_O)
    4181             :   // Basic gather4.
    4182           0 :   NODE_NAME_CASE(IMAGE_GATHER4)
    4183           0 :   NODE_NAME_CASE(IMAGE_GATHER4_CL)
    4184           0 :   NODE_NAME_CASE(IMAGE_GATHER4_L)
    4185           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B)
    4186           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_CL)
    4187           0 :   NODE_NAME_CASE(IMAGE_GATHER4_LZ)
    4188             :   // Gather4 with comparison.
    4189           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C)
    4190           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_CL)
    4191           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_L)
    4192           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B)
    4193           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL)
    4194           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_LZ)
    4195             :   // Gather4 with offsets.
    4196           0 :   NODE_NAME_CASE(IMAGE_GATHER4_O)
    4197           0 :   NODE_NAME_CASE(IMAGE_GATHER4_CL_O)
    4198           0 :   NODE_NAME_CASE(IMAGE_GATHER4_L_O)
    4199           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_O)
    4200           0 :   NODE_NAME_CASE(IMAGE_GATHER4_B_CL_O)
    4201           0 :   NODE_NAME_CASE(IMAGE_GATHER4_LZ_O)
    4202             :   // Gather4 with comparison and offsets.
    4203           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_O)
    4204           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_CL_O)
    4205           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_L_O)
    4206           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_O)
    4207           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL_O)
    4208           0 :   NODE_NAME_CASE(IMAGE_GATHER4_C_LZ_O)
    4209             : 
    4210             :   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
    4211             :   }
    4212           0 :   return nullptr;
    4213             : }
    4214             : 
    4215           8 : SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
    4216             :                                               SelectionDAG &DAG, int Enabled,
    4217             :                                               int &RefinementSteps,
    4218             :                                               bool &UseOneConstNR,
    4219             :                                               bool Reciprocal) const {
    4220           8 :   EVT VT = Operand.getValueType();
    4221             : 
    4222             :   if (VT == MVT::f32) {
    4223           5 :     RefinementSteps = 0;
    4224          10 :     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
    4225             :   }
    4226             : 
    4227             :   // TODO: There is also f64 rsq instruction, but the documentation is less
    4228             :   // clear on its precision.
    4229             : 
    4230           3 :   return SDValue();
    4231             : }
    4232             : 
    4233         158 : SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
    4234             :                                                SelectionDAG &DAG, int Enabled,
    4235             :                                                int &RefinementSteps) const {
    4236         158 :   EVT VT = Operand.getValueType();
    4237             : 
    4238             :   if (VT == MVT::f32) {
    4239             :     // Reciprocal, < 1 ulp error.
    4240             :     //
    4241             :     // This reciprocal approximation converges to < 0.5 ulp error with one
    4242             :     // newton rhapson performed with two fused multiple adds (FMAs).
    4243             : 
    4244          97 :     RefinementSteps = 0;
    4245         194 :     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
    4246             :   }
    4247             : 
    4248             :   // TODO: There is also f64 rcp instruction, but the documentation is less
    4249             :   // clear on its precision.
    4250             : 
    4251          61 :   return SDValue();
    4252             : }
    4253             : 
    4254      135353 : void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    4255             :     const SDValue Op, KnownBits &Known,
    4256             :     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
    4257             : 
    4258             :   Known.resetAll(); // Don't know anything.
    4259             : 
    4260             :   unsigned Opc = Op.getOpcode();
    4261             : 
    4262      135353 :   switch (Opc) {
    4263             :   default:
    4264             :     break;
    4265        8428 :   case AMDGPUISD::CARRY:
    4266             :   case AMDGPUISD::BORROW: {
    4267       16856 :     Known.Zero = APInt::getHighBitsSet(32, 31);
    4268        8428 :     break;
    4269             :   }
    4270             : 
    4271         476 :   case AMDGPUISD::BFE_I32:
    4272             :   case AMDGPUISD::BFE_U32: {
    4273             :     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4274             :     if (!CWidth)
    4275             :       return;
    4276             : 
    4277         952 :     uint32_t Width = CWidth->getZExtValue() & 0x1f;
    4278             : 
    4279         476 :     if (Opc == AMDGPUISD::BFE_U32)
    4280         888 :       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
    4281             : 
    4282             :     break;
    4283             :   }
    4284             :   case AMDGPUISD::FP_TO_FP16:
    4285             :   case AMDGPUISD::FP16_ZEXT: {
    4286             :     unsigned BitWidth = Known.getBitWidth();
    4287             : 
    4288             :     // High bits are zero.
    4289        4408 :     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    4290        2204 :     break;
    4291             :   }
    4292             :   case AMDGPUISD::MUL_U24:
    4293             :   case AMDGPUISD::MUL_I24: {
    4294        8764 :     KnownBits LHSKnown, RHSKnown;
    4295       17528 :     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
    4296        8764 :     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
    4297             : 
    4298        8764 :     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
    4299        8764 :                       RHSKnown.countMinTrailingZeros();
    4300       26292 :     Known.Zero.setLowBits(std::min(TrailZ, 32u));
    4301             : 
    4302       17528 :     unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
    4303       17528 :     unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
    4304       17528 :     unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
    4305        8764 :     if (MaxValBits >= 32)
    4306             :       break;
    4307             :     bool Negative = false;
    4308        4965 :     if (Opc == AMDGPUISD::MUL_I24) {
    4309          10 :       bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
    4310          10 :       bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
    4311          10 :       bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
    4312          10 :       bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
    4313           5 :       if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
    4314             :         break;
    4315           5 :       Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
    4316             :     }
    4317             :     if (Negative)
    4318           0 :       Known.One.setHighBits(32 - MaxValBits);
    4319             :     else
    4320        4965 :       Known.Zero.setHighBits(32 - MaxValBits);
    4321             :     break;
    4322             :   }
    4323          20 :   case AMDGPUISD::PERM: {
    4324             :     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4325             :     if (!CMask)
    4326           0 :       return;
    4327             : 
    4328          20 :     KnownBits LHSKnown, RHSKnown;
    4329          40 :     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
    4330          20 :     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
    4331          40 :     unsigned Sel = CMask->getZExtValue();
    4332             : 
    4333         180 :     for (unsigned I = 0; I < 32; I += 8) {
    4334          80 :       unsigned SelBits = Sel & 0xff;
    4335          80 :       if (SelBits < 4) {
    4336          20 :         SelBits *= 8;
    4337          20 :         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
    4338          20 :         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    4339          60 :       } else if (SelBits < 7) {
    4340          34 :         SelBits = (SelBits & 3) * 8;
    4341          34 :         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
    4342          34 :         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    4343          26 :       } else if (SelBits == 0x0c) {
    4344          24 :         Known.Zero |= 0xff << I;
    4345           2 :       } else if (SelBits > 0x0c) {
    4346           0 :         Known.One |= 0xff << I;
    4347             :       }
    4348          80 :       Sel >>= 8;
    4349             :     }
    4350             :     break;
    4351             :   }
    4352       71607 :   case ISD::INTRINSIC_WO_CHAIN: {
    4353      143214 :     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4354       71607 :     switch (IID) {
    4355        5544 :     case Intrinsic::amdgcn_mbcnt_lo:
    4356             :     case Intrinsic::amdgcn_mbcnt_hi: {
    4357             :       // These return at most the wavefront size - 1.
    4358       11088 :       unsigned Size = Op.getValueType().getSizeInBits();
    4359       11088 :       Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
    4360             :       break;
    4361             :     }
    4362             :     default:
    4363             :       break;
    4364             :     }
    4365             :   }
    4366             :   }
    4367             : }
    4368             : 
    4369        1699 : unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    4370             :     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    4371             :     unsigned Depth) const {
    4372        1699 :   switch (Op.getOpcode()) {
    4373           2 :   case AMDGPUISD::BFE_I32: {
    4374             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4375             :     if (!Width)
    4376             :       return 1;
    4377             : 
    4378           4 :     unsigned SignBits = 32 - Width->getZExtValue() + 1;
    4379           2 :     if (!isNullConstant(Op.getOperand(1)))
    4380             :       return SignBits;
    4381             : 
    4382             :     // TODO: Could probably figure something out with non-0 offsets.
    4383           0 :     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    4384           0 :     return std::max(SignBits, Op0SignBits);
    4385             :   }
    4386             : 
    4387           0 :   case AMDGPUISD::BFE_U32: {
    4388             :     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    4389           0 :     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
    4390             :   }
    4391             : 
    4392             :   case AMDGPUISD::CARRY:
    4393             :   case AMDGPUISD::BORROW:
    4394             :     return 31;
    4395           1 :   case AMDGPUISD::FP_TO_FP16:
    4396             :   case AMDGPUISD::FP16_ZEXT:
    4397           1 :     return 16;
    4398         998 :   default:
    4399         998 :     return 1;
    4400             :   }
    4401             : }

Generated by: LCOV version 1.13