LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIISelLowering.cpp (source / functions)
Test: llvm-toolchain.info          Date: 2018-02-18 03:11:45
             Hit    Total   Coverage
Lines:       2580   2744    94.0 %
Functions:   146    146     100.0 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Custom DAG lowering for SI
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #ifdef _MSC_VER
      16             : // Provide M_PI.
      17             : #define _USE_MATH_DEFINES
      18             : #endif
      19             : 
      20             : #include "SIISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUIntrinsicInfo.h"
      23             : #include "AMDGPUSubtarget.h"
      24             : #include "AMDGPUTargetMachine.h"
      25             : #include "SIDefines.h"
      26             : #include "SIInstrInfo.h"
      27             : #include "SIMachineFunctionInfo.h"
      28             : #include "SIRegisterInfo.h"
      29             : #include "Utils/AMDGPUBaseInfo.h"
      30             : #include "llvm/ADT/APFloat.h"
      31             : #include "llvm/ADT/APInt.h"
      32             : #include "llvm/ADT/ArrayRef.h"
      33             : #include "llvm/ADT/BitVector.h"
      34             : #include "llvm/ADT/SmallVector.h"
      35             : #include "llvm/ADT/Statistic.h"
      36             : #include "llvm/ADT/StringRef.h"
      37             : #include "llvm/ADT/StringSwitch.h"
      38             : #include "llvm/ADT/Twine.h"
      39             : #include "llvm/CodeGen/Analysis.h"
      40             : #include "llvm/CodeGen/CallingConvLower.h"
      41             : #include "llvm/CodeGen/DAGCombine.h"
      42             : #include "llvm/CodeGen/ISDOpcodes.h"
      43             : #include "llvm/CodeGen/MachineBasicBlock.h"
      44             : #include "llvm/CodeGen/MachineFrameInfo.h"
      45             : #include "llvm/CodeGen/MachineFunction.h"
      46             : #include "llvm/CodeGen/MachineInstr.h"
      47             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      48             : #include "llvm/CodeGen/MachineMemOperand.h"
      49             : #include "llvm/CodeGen/MachineModuleInfo.h"
      50             : #include "llvm/CodeGen/MachineOperand.h"
      51             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      52             : #include "llvm/CodeGen/MachineValueType.h"
      53             : #include "llvm/CodeGen/SelectionDAG.h"
      54             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      55             : #include "llvm/CodeGen/TargetCallingConv.h"
      56             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      57             : #include "llvm/CodeGen/ValueTypes.h"
      58             : #include "llvm/IR/Constants.h"
      59             : #include "llvm/IR/DataLayout.h"
      60             : #include "llvm/IR/DebugLoc.h"
      61             : #include "llvm/IR/DerivedTypes.h"
      62             : #include "llvm/IR/DiagnosticInfo.h"
      63             : #include "llvm/IR/Function.h"
      64             : #include "llvm/IR/GlobalValue.h"
      65             : #include "llvm/IR/InstrTypes.h"
      66             : #include "llvm/IR/Instruction.h"
      67             : #include "llvm/IR/Instructions.h"
      68             : #include "llvm/IR/IntrinsicInst.h"
      69             : #include "llvm/IR/Type.h"
      70             : #include "llvm/Support/Casting.h"
      71             : #include "llvm/Support/CodeGen.h"
      72             : #include "llvm/Support/CommandLine.h"
      73             : #include "llvm/Support/Compiler.h"
      74             : #include "llvm/Support/ErrorHandling.h"
      75             : #include "llvm/Support/KnownBits.h"
      76             : #include "llvm/Support/MathExtras.h"
      77             : #include "llvm/Target/TargetOptions.h"
      78             : #include <cassert>
      79             : #include <cmath>
      80             : #include <cstdint>
      81             : #include <iterator>
      82             : #include <tuple>
      83             : #include <utility>
      84             : #include <vector>
      85             : 
      86             : using namespace llvm;
      87             : 
      88             : #define DEBUG_TYPE "si-lower"
      89             : 
      90             : STATISTIC(NumTailCalls, "Number of tail calls");
      91             : 
      92       97317 : static cl::opt<bool> EnableVGPRIndexMode(
      93             :   "amdgpu-vgpr-index-mode",
      94       97317 :   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
      95      291951 :   cl::init(false));
      96             : 
      97       97317 : static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
      98             :   "amdgpu-frame-index-zero-bits",
      99       97317 :   cl::desc("High bits of frame index assumed to be zero"),
     100      194634 :   cl::init(5),
     101      291951 :   cl::ReallyHidden);
     102             : 
     103             : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
     104             :   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
     105         358 :   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
     106         402 :     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
     107             :       return AMDGPU::SGPR0 + Reg;
     108             :     }
     109             :   }
     110           0 :   llvm_unreachable("Cannot allocate sgpr");
     111             : }
     112             : 
     113        2046 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
     114        2046 :                                    const SISubtarget &STI)
     115        2046 :     : AMDGPUTargetLowering(TM, STI) {
     116             :   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
     117             :   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
     118             : 
     119             :   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
     120             :   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
     121             : 
     122             :   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
     123             :   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
     124             :   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
     125             : 
     126             :   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
     127             :   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
     128             : 
     129             :   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
     130             :   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
     131             : 
     132             :   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
     133             :   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
     134             : 
     135             :   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
     136             :   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
     137             : 
     138        2046 :   if (Subtarget->has16BitInsts()) {
     139             :     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     140             :     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
     141             :   }
     142             : 
     143        2046 :   if (Subtarget->hasVOP3PInsts()) {
     144             :     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     145             :     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
     146             :   }
     147             : 
     148        4092 :   computeRegisterProperties(STI.getRegisterInfo());
     149             : 
     150             :   // We need to custom lower vector stores from local memory
     151             :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     152             :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
     153             :   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
     154             :   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
     155             :   setOperationAction(ISD::LOAD, MVT::i1, Custom);
     156             : 
     157             :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     158             :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
     159             :   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
     160             :   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
     161             :   setOperationAction(ISD::STORE, MVT::i1, Custom);
     162             : 
     163             :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     164             :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
     165             :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
     166             :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
     167             :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
     168             :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
     169             :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
     170             :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
     171             :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
     172             :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
     173             : 
     174             :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     175             :   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
     176             :   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
     177             : 
     178             :   setOperationAction(ISD::SELECT, MVT::i1, Promote);
     179             :   setOperationAction(ISD::SELECT, MVT::i64, Custom);
     180             :   setOperationAction(ISD::SELECT, MVT::f64, Promote);
     181             :   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
     182             : 
     183             :   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     184             :   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
     185             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     186             :   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
     187             :   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
     188             : 
     189             :   setOperationAction(ISD::SETCC, MVT::i1, Promote);
     190             :   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
     191             :   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
     192             :   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
     193             : 
     194             :   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
     195             :   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
     196             : 
     197             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
     198             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
     199             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
     200             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
     201             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
     202             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
     203             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
     204             : 
     205             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     206             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
     207             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
     208             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
     209             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
     210             : 
     211             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
     212             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
     213             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
     214             : 
     215             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     216             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
     217             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
     218             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
     219             : 
     220             :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     221             :   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
     222             :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     223             :   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
     224             :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     225             :   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     226             : 
     227             :   setOperationAction(ISD::UADDO, MVT::i32, Legal);
     228             :   setOperationAction(ISD::USUBO, MVT::i32, Legal);
     229             : 
     230             :   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
     231             :   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
     232             : 
     233             : #if 0
     234             :   setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
     235             :   setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
     236             : #endif
     237             : 
     238             :   //setOperationAction(ISD::ADDC, MVT::i64, Expand);
     239             :   //setOperationAction(ISD::SUBC, MVT::i64, Expand);
     240             : 
     241             :   // We only support LOAD/STORE and vector manipulation ops for vectors
     242             :   // with > 4 elements.
     243       12276 :   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
     244       14322 :         MVT::v2i64, MVT::v2f64}) {
     245     6371244 :     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     246     3179484 :       switch (Op) {
     247             :       case ISD::LOAD:
     248             :       case ISD::STORE:
     249             :       case ISD::BUILD_VECTOR:
     250             :       case ISD::BITCAST:
     251             :       case ISD::EXTRACT_VECTOR_ELT:
     252             :       case ISD::INSERT_VECTOR_ELT:
     253             :       case ISD::INSERT_SUBVECTOR:
     254             :       case ISD::EXTRACT_SUBVECTOR:
     255             :       case ISD::SCALAR_TO_VECTOR:
     256             :         break;
     257       12276 :       case ISD::CONCAT_VECTORS:
     258             :         setOperationAction(Op, VT, Custom);
     259             :         break;
     260     3056724 :       default:
     261             :         setOperationAction(Op, VT, Expand);
     262             :         break;
     263             :       }
     264             :     }
     265             :   }
     266             : 
      267             :   // TODO: For dynamic 64-bit vector inserts/extracts, we should emit a pseudo that
     268             :   // is expanded to avoid having two separate loops in case the index is a VGPR.
     269             : 
     270             :   // Most operations are naturally 32-bit vector operations. We only support
     271             :   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
     272       10230 :   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
     273             :     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
     274             :     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
     275             : 
     276             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     277             :     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
     278             : 
     279             :     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
     280             :     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
     281             : 
     282             :     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
     283             :     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
     284             :   }
     285             : 
     286             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
     287             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
     288             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
     289             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
     290             : 
     291             :   // Avoid stack access for these.
     292             :   // TODO: Generalize to more vector types.
     293             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
     294             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
     295             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     296             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     297             : 
     298             :   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
     299             :   // and output demarshalling
     300             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
     301             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
     302             : 
      303             :   // We can't return success/failure, only the old value;
      304             :   // let LLVM add the comparison.
     305             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
     306             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
     307             : 
     308        2046 :   if (getSubtarget()->hasFlatAddressSpace()) {
     309             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
     310             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
     311             :   }
     312             : 
     313             :   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
     314             :   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
     315             : 
      316             :   // This is s_memtime on SI and s_memrealtime on VI.
     317             :   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
     318             :   setOperationAction(ISD::TRAP, MVT::Other, Custom);
     319             :   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
     320             : 
     321             :   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
     322             :   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     323             : 
     324        2046 :   if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
     325             :     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     326             :     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     327             :     setOperationAction(ISD::FRINT, MVT::f64, Legal);
     328             :   }
     329             : 
     330             :   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     331             : 
     332             :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     333             :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     334             :   setOperationAction(ISD::FDIV, MVT::f32, Custom);
     335             :   setOperationAction(ISD::FDIV, MVT::f64, Custom);
     336             : 
     337        2046 :   if (Subtarget->has16BitInsts()) {
     338             :     setOperationAction(ISD::Constant, MVT::i16, Legal);
     339             : 
     340             :     setOperationAction(ISD::SMIN, MVT::i16, Legal);
     341             :     setOperationAction(ISD::SMAX, MVT::i16, Legal);
     342             : 
     343             :     setOperationAction(ISD::UMIN, MVT::i16, Legal);
     344             :     setOperationAction(ISD::UMAX, MVT::i16, Legal);
     345             : 
     346             :     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
     347             :     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
     348             : 
     349             :     setOperationAction(ISD::ROTR, MVT::i16, Promote);
     350             :     setOperationAction(ISD::ROTL, MVT::i16, Promote);
     351             : 
     352             :     setOperationAction(ISD::SDIV, MVT::i16, Promote);
     353             :     setOperationAction(ISD::UDIV, MVT::i16, Promote);
     354             :     setOperationAction(ISD::SREM, MVT::i16, Promote);
     355             :     setOperationAction(ISD::UREM, MVT::i16, Promote);
     356             : 
     357             :     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
     358             :     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
     359             : 
     360             :     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
     361             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
     362             :     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
     363             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
     364             : 
     365             :     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
     366             : 
     367             :     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
     368             : 
     369             :     setOperationAction(ISD::LOAD, MVT::i16, Custom);
     370             : 
     371             :     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     372             : 
     373             :     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
     374             :     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
     375             :     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     376             :     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
     377             : 
     378             :     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
     379             :     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     380             :     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
     381             :     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     382             : 
     383             :     // F16 - Constant Actions.
     384             :     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
     385             : 
     386             :     // F16 - Load/Store Actions.
     387             :     setOperationAction(ISD::LOAD, MVT::f16, Promote);
     388             :     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
     389             :     setOperationAction(ISD::STORE, MVT::f16, Promote);
     390             :     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
     391             : 
     392             :     // F16 - VOP1 Actions.
     393             :     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     394             :     setOperationAction(ISD::FCOS, MVT::f16, Promote);
     395             :     setOperationAction(ISD::FSIN, MVT::f16, Promote);
     396             :     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
     397             :     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     398             :     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
     399             :     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
     400             :     setOperationAction(ISD::FROUND, MVT::f16, Custom);
     401             : 
     402             :     // F16 - VOP2 Actions.
     403             :     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     404             :     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
     405             :     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     406             :     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     407             :     setOperationAction(ISD::FDIV, MVT::f16, Custom);
     408             : 
     409             :     // F16 - VOP3 Actions.
     410             :     setOperationAction(ISD::FMA, MVT::f16, Legal);
     411         938 :     if (!Subtarget->hasFP16Denormals())
     412             :       setOperationAction(ISD::FMAD, MVT::f16, Legal);
     413             :   }
     414             : 
     415        2046 :   if (Subtarget->hasVOP3PInsts()) {
     416        1000 :     for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
     417      207600 :       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     418      103600 :         switch (Op) {
     419             :         case ISD::LOAD:
     420             :         case ISD::STORE:
     421             :         case ISD::BUILD_VECTOR:
     422             :         case ISD::BITCAST:
     423             :         case ISD::EXTRACT_VECTOR_ELT:
     424             :         case ISD::INSERT_VECTOR_ELT:
     425             :         case ISD::INSERT_SUBVECTOR:
     426             :         case ISD::EXTRACT_SUBVECTOR:
     427             :         case ISD::SCALAR_TO_VECTOR:
     428             :           break;
     429         400 :         case ISD::CONCAT_VECTORS:
     430             :           setOperationAction(Op, VT, Custom);
     431             :           break;
     432       99600 :         default:
     433             :           setOperationAction(Op, VT, Expand);
     434             :           break;
     435             :         }
     436             :       }
     437             :     }
     438             : 
     439             :     // XXX - Do these do anything? Vector constants turn into build_vector.
     440             :     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
     441             :     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
     442             : 
     443             :     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
     444             :     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
     445             :     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
     446             :     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
     447             : 
     448             :     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
     449             :     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
     450             :     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
     451             :     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
     452             : 
     453             :     setOperationAction(ISD::AND, MVT::v2i16, Promote);
     454             :     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
     455             :     setOperationAction(ISD::OR, MVT::v2i16, Promote);
     456             :     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     457             :     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     458             :     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
     459             :     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
     460             :     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
     461             :     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
     462             :     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
     463             : 
     464             :     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     465             :     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     466             :     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
     467             :     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
     468             :     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
     469             :     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
     470             :     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
     471             :     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
     472             :     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
     473             :     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
     474             : 
     475             :     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     476             :     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
     477             :     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     478             :     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
     479             :     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
     480             :     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
     481             : 
     482             :     // This isn't really legal, but this avoids the legalizer unrolling it (and
     483             :     // allows matching fneg (fabs x) patterns)
     484             :     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
     485             : 
     486             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     487             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     488             : 
     489             :     setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
     490             :     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
     491             :     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
     492             :     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
     493             :   } else {
     494             :     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     495             :     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
     496             :   }
     497             : 
     498       22506 :   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
     499             :     setOperationAction(ISD::SELECT, VT, Custom);
     500             :   }
     501             : 
     502             :   setTargetDAGCombine(ISD::ADD);
     503             :   setTargetDAGCombine(ISD::ADDCARRY);
     504             :   setTargetDAGCombine(ISD::SUB);
     505             :   setTargetDAGCombine(ISD::SUBCARRY);
     506             :   setTargetDAGCombine(ISD::FADD);
     507             :   setTargetDAGCombine(ISD::FSUB);
     508             :   setTargetDAGCombine(ISD::FMINNUM);
     509             :   setTargetDAGCombine(ISD::FMAXNUM);
     510             :   setTargetDAGCombine(ISD::SMIN);
     511             :   setTargetDAGCombine(ISD::SMAX);
     512             :   setTargetDAGCombine(ISD::UMIN);
     513             :   setTargetDAGCombine(ISD::UMAX);
     514             :   setTargetDAGCombine(ISD::SETCC);
     515             :   setTargetDAGCombine(ISD::AND);
     516             :   setTargetDAGCombine(ISD::OR);
     517             :   setTargetDAGCombine(ISD::XOR);
     518             :   setTargetDAGCombine(ISD::SINT_TO_FP);
     519             :   setTargetDAGCombine(ISD::UINT_TO_FP);
     520             :   setTargetDAGCombine(ISD::FCANONICALIZE);
     521             :   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
     522             :   setTargetDAGCombine(ISD::ZERO_EXTEND);
     523             :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     524             :   setTargetDAGCombine(ISD::BUILD_VECTOR);
     525             : 
      526             :   // All memory operations. Some folding on the pointer operand is done to help
      527             :   // match the constant offsets in the addressing modes.
     528             :   setTargetDAGCombine(ISD::LOAD);
     529             :   setTargetDAGCombine(ISD::STORE);
     530             :   setTargetDAGCombine(ISD::ATOMIC_LOAD);
     531             :   setTargetDAGCombine(ISD::ATOMIC_STORE);
     532             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
     533             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
     534             :   setTargetDAGCombine(ISD::ATOMIC_SWAP);
     535             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
     536             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
     537             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
     538             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
     539             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
     540             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
     541             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
     542             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
     543             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
     544             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
     545             : 
     546             :   setSchedulingPreference(Sched::RegPressure);
     547        2046 : }
     548             : 
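The constructor above is one long application of a single TargetLowering idiom: each (opcode, type) pair is given a legalization action, and Promote entries are paired with AddPromotedToType to name the type the operation is actually performed in. A minimal sketch of that idiom for a hypothetical target (MyTargetLowering, MySubtarget and MyTarget::GPR32RegClass are illustrative names, not part of this file):

    // Sketch only: the action-table idiom used throughout the constructor above.
    MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                       const MySubtarget &STI)
        : TargetLowering(TM) {
      addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);

      // Legal: instruction selection can match the node directly.
      setOperationAction(ISD::ADD, MVT::i32, Legal);

      // Expand: the legalizer rewrites the node in terms of other operations.
      setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);

      // Custom: this class's LowerOperation() is asked to lower the node.
      setOperationAction(ISD::LOAD, MVT::i32, Custom);

      // Promote: the node is performed in another type; AddPromotedToType
      // records which one (here f64 selects are done as i64 selects).
      setOperationAction(ISD::SELECT, MVT::f64, Promote);
      AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

      computeRegisterProperties(STI.getRegisterInfo());
    }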
     549      888194 : const SISubtarget *SITargetLowering::getSubtarget() const {
     550      888194 :   return static_cast<const SISubtarget *>(Subtarget);
     551             : }
     552             : 
     553             : //===----------------------------------------------------------------------===//
     554             : // TargetLowering queries
     555             : //===----------------------------------------------------------------------===//
     556             : 
     557          28 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
     558             :   // SI has some legal vector types, but no legal vector operations. Say no
     559             :   // shuffles are legal in order to prefer scalarizing some vector operations.
     560          28 :   return false;
     561             : }
     562             : 
     563        8645 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     564             :                                           const CallInst &CI,
     565             :                                           MachineFunction &MF,
     566             :                                           unsigned IntrID) const {
     567        8645 :   switch (IntrID) {
     568         245 :   case Intrinsic::amdgcn_atomic_inc:
     569             :   case Intrinsic::amdgcn_atomic_dec:
     570             :   case Intrinsic::amdgcn_ds_fadd:
     571             :   case Intrinsic::amdgcn_ds_fmin:
     572             :   case Intrinsic::amdgcn_ds_fmax: {
     573         245 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     574         245 :     Info.memVT = MVT::getVT(CI.getType());
     575             :     Info.ptrVal = CI.getOperand(0);
     576         245 :     Info.align = 0;
     577         245 :     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     578             : 
     579             :     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     580         242 :     if (!Vol || !Vol->isZero())
     581             :       Info.flags |= MachineMemOperand::MOVolatile;
     582             : 
     583             :     return true;
     584             :   }
     585             : 
     586             :   // Image load.
     587         454 :   case Intrinsic::amdgcn_image_load:
     588             :   case Intrinsic::amdgcn_image_load_mip:
     589             : 
     590             :   // Sample.
     591             :   case Intrinsic::amdgcn_image_sample:
     592             :   case Intrinsic::amdgcn_image_sample_cl:
     593             :   case Intrinsic::amdgcn_image_sample_d:
     594             :   case Intrinsic::amdgcn_image_sample_d_cl:
     595             :   case Intrinsic::amdgcn_image_sample_l:
     596             :   case Intrinsic::amdgcn_image_sample_b:
     597             :   case Intrinsic::amdgcn_image_sample_b_cl:
     598             :   case Intrinsic::amdgcn_image_sample_lz:
     599             :   case Intrinsic::amdgcn_image_sample_cd:
     600             :   case Intrinsic::amdgcn_image_sample_cd_cl:
     601             : 
     602             :     // Sample with comparison.
     603             :   case Intrinsic::amdgcn_image_sample_c:
     604             :   case Intrinsic::amdgcn_image_sample_c_cl:
     605             :   case Intrinsic::amdgcn_image_sample_c_d:
     606             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
     607             :   case Intrinsic::amdgcn_image_sample_c_l:
     608             :   case Intrinsic::amdgcn_image_sample_c_b:
     609             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
     610             :   case Intrinsic::amdgcn_image_sample_c_lz:
     611             :   case Intrinsic::amdgcn_image_sample_c_cd:
     612             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
     613             : 
     614             :     // Sample with offsets.
     615             :   case Intrinsic::amdgcn_image_sample_o:
     616             :   case Intrinsic::amdgcn_image_sample_cl_o:
     617             :   case Intrinsic::amdgcn_image_sample_d_o:
     618             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
     619             :   case Intrinsic::amdgcn_image_sample_l_o:
     620             :   case Intrinsic::amdgcn_image_sample_b_o:
     621             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
     622             :   case Intrinsic::amdgcn_image_sample_lz_o:
     623             :   case Intrinsic::amdgcn_image_sample_cd_o:
     624             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
     625             : 
     626             :     // Sample with comparison and offsets.
     627             :   case Intrinsic::amdgcn_image_sample_c_o:
     628             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
     629             :   case Intrinsic::amdgcn_image_sample_c_d_o:
     630             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
     631             :   case Intrinsic::amdgcn_image_sample_c_l_o:
     632             :   case Intrinsic::amdgcn_image_sample_c_b_o:
     633             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
     634             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
     635             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
     636             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
     637             : 
     638             :     // Basic gather4
     639             :   case Intrinsic::amdgcn_image_gather4:
     640             :   case Intrinsic::amdgcn_image_gather4_cl:
     641             :   case Intrinsic::amdgcn_image_gather4_l:
     642             :   case Intrinsic::amdgcn_image_gather4_b:
     643             :   case Intrinsic::amdgcn_image_gather4_b_cl:
     644             :   case Intrinsic::amdgcn_image_gather4_lz:
     645             : 
     646             :     // Gather4 with comparison
     647             :   case Intrinsic::amdgcn_image_gather4_c:
     648             :   case Intrinsic::amdgcn_image_gather4_c_cl:
     649             :   case Intrinsic::amdgcn_image_gather4_c_l:
     650             :   case Intrinsic::amdgcn_image_gather4_c_b:
     651             :   case Intrinsic::amdgcn_image_gather4_c_b_cl:
     652             :   case Intrinsic::amdgcn_image_gather4_c_lz:
     653             : 
     654             :     // Gather4 with offsets
     655             :   case Intrinsic::amdgcn_image_gather4_o:
     656             :   case Intrinsic::amdgcn_image_gather4_cl_o:
     657             :   case Intrinsic::amdgcn_image_gather4_l_o:
     658             :   case Intrinsic::amdgcn_image_gather4_b_o:
     659             :   case Intrinsic::amdgcn_image_gather4_b_cl_o:
     660             :   case Intrinsic::amdgcn_image_gather4_lz_o:
     661             : 
     662             :     // Gather4 with comparison and offsets
     663             :   case Intrinsic::amdgcn_image_gather4_c_o:
     664             :   case Intrinsic::amdgcn_image_gather4_c_cl_o:
     665             :   case Intrinsic::amdgcn_image_gather4_c_l_o:
     666             :   case Intrinsic::amdgcn_image_gather4_c_b_o:
     667             :   case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
     668             :   case Intrinsic::amdgcn_image_gather4_c_lz_o: {
     669         454 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     670         454 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     671         454 :     Info.memVT = MVT::getVT(CI.getType());
     672         454 :     Info.ptrVal = MFI->getImagePSV(
     673             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     674             :       CI.getArgOperand(1));
     675         454 :     Info.align = 0;
     676         454 :     Info.flags = MachineMemOperand::MOLoad |
     677             :                  MachineMemOperand::MODereferenceable;
     678         454 :     return true;
     679             :   }
     680          40 :   case Intrinsic::amdgcn_image_store:
     681             :   case Intrinsic::amdgcn_image_store_mip: {
     682          40 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     683          40 :     Info.opc = ISD::INTRINSIC_VOID;
     684          40 :     Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
     685          40 :     Info.ptrVal = MFI->getImagePSV(
     686             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     687             :       CI.getArgOperand(2));
     688          40 :     Info.flags = MachineMemOperand::MOStore |
     689             :                  MachineMemOperand::MODereferenceable;
     690          40 :     Info.align = 0;
     691          40 :     return true;
     692             :   }
     693          28 :   case Intrinsic::amdgcn_image_atomic_swap:
     694             :   case Intrinsic::amdgcn_image_atomic_add:
     695             :   case Intrinsic::amdgcn_image_atomic_sub:
     696             :   case Intrinsic::amdgcn_image_atomic_smin:
     697             :   case Intrinsic::amdgcn_image_atomic_umin:
     698             :   case Intrinsic::amdgcn_image_atomic_smax:
     699             :   case Intrinsic::amdgcn_image_atomic_umax:
     700             :   case Intrinsic::amdgcn_image_atomic_and:
     701             :   case Intrinsic::amdgcn_image_atomic_or:
     702             :   case Intrinsic::amdgcn_image_atomic_xor:
     703             :   case Intrinsic::amdgcn_image_atomic_inc:
     704             :   case Intrinsic::amdgcn_image_atomic_dec: {
     705          28 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     706          28 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     707          28 :     Info.memVT = MVT::getVT(CI.getType());
     708          28 :     Info.ptrVal = MFI->getImagePSV(
     709             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     710             :       CI.getArgOperand(2));
     711             : 
     712             :     Info.flags = MachineMemOperand::MOLoad |
     713             :                  MachineMemOperand::MOStore |
     714             :                  MachineMemOperand::MODereferenceable;
     715             : 
     716             :     // XXX - Should this be volatile without known ordering?
     717             :     Info.flags |= MachineMemOperand::MOVolatile;
     718          28 :     return true;
     719             :   }
     720           2 :   case Intrinsic::amdgcn_image_atomic_cmpswap: {
     721           2 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     722           2 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     723           2 :     Info.memVT = MVT::getVT(CI.getType());
     724           2 :     Info.ptrVal = MFI->getImagePSV(
     725             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     726             :       CI.getArgOperand(3));
     727             : 
     728             :     Info.flags = MachineMemOperand::MOLoad |
     729             :                  MachineMemOperand::MOStore |
     730             :                  MachineMemOperand::MODereferenceable;
     731             : 
     732             :     // XXX - Should this be volatile without known ordering?
     733             :     Info.flags |= MachineMemOperand::MOVolatile;
     734           2 :     return true;
     735             :   }
     736         215 :   case Intrinsic::amdgcn_tbuffer_load:
     737             :   case Intrinsic::amdgcn_buffer_load:
     738             :   case Intrinsic::amdgcn_buffer_load_format: {
     739         215 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     740         215 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     741         215 :     Info.ptrVal = MFI->getBufferPSV(
     742             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     743             :       CI.getArgOperand(0));
     744         215 :     Info.memVT = MVT::getVT(CI.getType());
     745         215 :     Info.flags = MachineMemOperand::MOLoad |
     746             :                  MachineMemOperand::MODereferenceable;
     747             : 
     748             :     // There is a constant offset component, but there are additional register
     749             :     // offsets which could break AA if we set the offset to anything non-0.
     750         215 :     return true;
     751             :   }
     752         180 :   case Intrinsic::amdgcn_tbuffer_store:
     753             :   case Intrinsic::amdgcn_buffer_store:
     754             :   case Intrinsic::amdgcn_buffer_store_format: {
     755         180 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     756         180 :     Info.opc = ISD::INTRINSIC_VOID;
     757         180 :     Info.ptrVal = MFI->getBufferPSV(
     758             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     759             :       CI.getArgOperand(1));
     760         180 :     Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
     761         180 :     Info.flags = MachineMemOperand::MOStore |
     762             :                  MachineMemOperand::MODereferenceable;
     763         180 :     return true;
     764             :   }
     765          34 :   case Intrinsic::amdgcn_buffer_atomic_swap:
     766             :   case Intrinsic::amdgcn_buffer_atomic_add:
     767             :   case Intrinsic::amdgcn_buffer_atomic_sub:
     768             :   case Intrinsic::amdgcn_buffer_atomic_smin:
     769             :   case Intrinsic::amdgcn_buffer_atomic_umin:
     770             :   case Intrinsic::amdgcn_buffer_atomic_smax:
     771             :   case Intrinsic::amdgcn_buffer_atomic_umax:
     772             :   case Intrinsic::amdgcn_buffer_atomic_and:
     773             :   case Intrinsic::amdgcn_buffer_atomic_or:
     774             :   case Intrinsic::amdgcn_buffer_atomic_xor: {
     775          34 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     776          34 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     777          34 :     Info.ptrVal = MFI->getBufferPSV(
     778             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     779             :       CI.getArgOperand(1));
     780          34 :     Info.memVT = MVT::getVT(CI.getType());
     781          34 :     Info.flags = MachineMemOperand::MOLoad |
     782             :                  MachineMemOperand::MOStore |
     783             :                  MachineMemOperand::MODereferenceable |
     784             :                  MachineMemOperand::MOVolatile;
     785          34 :     return true;
     786             :   }
     787          12 :   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
     788          12 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     789          12 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     790          12 :     Info.ptrVal = MFI->getBufferPSV(
     791             :       *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     792             :       CI.getArgOperand(2));
     793          12 :     Info.memVT = MVT::getVT(CI.getType());
     794          12 :     Info.flags = MachineMemOperand::MOLoad |
     795             :                  MachineMemOperand::MOStore |
     796             :                  MachineMemOperand::MODereferenceable |
     797             :                  MachineMemOperand::MOVolatile;
     798          12 :     return true;
     799             :   }
     800             :   default:
     801             :     return false;
     802             :   }
     803             : }
     804             : 
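getTgtMemIntrinsic is the hook SelectionDAG uses to attach MachineMemOperand information to target intrinsics; every case in the switch above fills the same IntrinsicInfo fields (opc, memVT, ptrVal, align, flags). As a sketch, a new read-only case would take the following shape (the intrinsic name is hypothetical, not a real AMDGPU intrinsic):

    case Intrinsic::amdgcn_example_readonly_load: {  // hypothetical intrinsic
      Info.opc = ISD::INTRINSIC_W_CHAIN;        // chained node that produces a value
      Info.memVT = MVT::getVT(CI.getType());    // memory type is the call's result type
      Info.ptrVal = CI.getOperand(0);           // pointer operand, used for alias analysis
      Info.align = 0;                           // 0 means ABI alignment of memVT
      Info.flags = MachineMemOperand::MOLoad |  // read-only, known-dereferenceable access
                   MachineMemOperand::MODereferenceable;
      return true;
    }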
     805       12767 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
     806             :                                             SmallVectorImpl<Value*> &Ops,
     807             :                                             Type *&AccessTy) const {
     808             :   switch (II->getIntrinsicID()) {
     809         269 :   case Intrinsic::amdgcn_atomic_inc:
     810             :   case Intrinsic::amdgcn_atomic_dec:
     811             :   case Intrinsic::amdgcn_ds_fadd:
     812             :   case Intrinsic::amdgcn_ds_fmin:
     813             :   case Intrinsic::amdgcn_ds_fmax: {
     814         538 :     Value *Ptr = II->getArgOperand(0);
     815         269 :     AccessTy = II->getType();
     816         269 :     Ops.push_back(Ptr);
     817             :     return true;
     818             :   }
     819             :   default:
     820             :     return false;
     821             :   }
     822             : }
     823             : 
     824       32096 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
     825       32096 :   if (!Subtarget->hasFlatInstOffsets()) {
     826             :     // Flat instructions do not have offsets, and only have the register
     827             :     // address.
     828       30848 :     return AM.BaseOffs == 0 && AM.Scale == 0;
     829             :   }
     830             : 
     831             :   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
      832             :   // the sign bit is ignored and the offset is treated as a 12-bit unsigned value.
     833             : 
     834             :   // Just r + i
     835        1248 :   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
     836             : }
     837             : 
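The bit-width checks above correspond to concrete offset ranges: isUInt<12> accepts immediate offsets 0..4095, while the isInt<13> test used for global instructions on GFX9 (in isLegalGlobalAddressingMode below) accepts -4096..4095. A small standalone check of that arithmetic, assuming only llvm/Support/MathExtras.h:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    // Offset ranges implied by the two addressing-mode checks.
    void checkFlatOffsetRanges() {
      assert(llvm::isUInt<12>(4095) && !llvm::isUInt<12>(4096)); // flat: 0..4095
      assert(llvm::isInt<13>(-4096) && llvm::isInt<13>(4095));   // global (GFX9): -4096..4095
      assert(!llvm::isInt<13>(4096));
    }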
     838       67703 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
     839       67703 :   if (Subtarget->hasFlatGlobalInsts())
     840       21546 :     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
     841             : 
     842       56930 :   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
      843             :       // Assume we will use FLAT for all global memory accesses
     844             :       // on VI.
     845             :       // FIXME: This assumption is currently wrong.  On VI we still use
     846             :       // MUBUF instructions for the r + i addressing mode.  As currently
      847             :       // implemented, the MUBUF instructions only work on buffers < 4GB.
     848             :       // It may be possible to support > 4GB buffers with MUBUF instructions,
     849             :       // by setting the stride value in the resource descriptor which would
     850             :       // increase the size limit to (stride * 4GB).  However, this is risky,
     851             :       // because it has never been validated.
     852       25510 :     return isLegalFlatAddressingMode(AM);
     853             :   }
     854             : 
     855       31420 :   return isLegalMUBUFAddressingMode(AM);
     856             : }
     857             : 
     858       36576 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
     859             :   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
     860             :   // additionally can do r + r + i with addr64. 32-bit has more addressing
     861             :   // mode options. Depending on the resource constant, it can also do
     862             :   // (i64 r0) + (i32 r1) * (i14 i).
     863             :   //
     864             :   // Private arrays end up using a scratch buffer most of the time, so also
     865             :   // assume those use MUBUF instructions. Scratch loads / stores are currently
     866             :   // implemented as mubuf instructions with offen bit set, so slightly
     867             :   // different than the normal addr64.
     868       36576 :   if (!isUInt<12>(AM.BaseOffs))
     869             :     return false;
     870             : 
     871             :   // FIXME: Since we can split immediate into soffset and immediate offset,
     872             :   // would it make sense to allow any immediate?
     873             : 
     874       36161 :   switch (AM.Scale) {
     875             :   case 0: // r + i or just i, depending on HasBaseReg.
     876             :     return true;
     877             :   case 1:
     878             :     return true; // We have r + r or r + i.
     879         785 :   case 2:
     880         785 :     if (AM.HasBaseReg) {
     881             :       // Reject 2 * r + r.
     882             :       return false;
     883             :     }
     884             : 
     885             :     // Allow 2 * r as r + r
     886             :     // Or  2 * r + i is allowed as r + r + i.
     887           0 :     return true;
     888       13258 :   default: // Don't allow n * r
     889       13258 :     return false;
     890             :   }
     891             : }
     892             : 
     893      101064 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     894             :                                              const AddrMode &AM, Type *Ty,
     895             :                                              unsigned AS, Instruction *I) const {
     896             :   // No global is ever allowed as a base.
     897      101064 :   if (AM.BaseGV)
     898             :     return false;
     899             : 
     900       98529 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
     901       67530 :     return isLegalGlobalAddressingMode(AM);
     902             : 
     903       61998 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
     904       30999 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
     905             :     // If the offset isn't a multiple of 4, it probably isn't going to be
     906             :     // correctly aligned.
     907             :     // FIXME: Can we get the real alignment here?
     908        3387 :     if (AM.BaseOffs % 4 != 0)
     909          72 :       return isLegalMUBUFAddressingMode(AM);
     910             : 
     911             :     // There are no SMRD extloads, so if we have to do a small type access we
     912             :     // will use a MUBUF load.
     913             :     // FIXME?: We also need to do this if unaligned, but we don't know the
     914             :     // alignment here.
     915        3315 :     if (DL.getTypeStoreSize(Ty) < 4)
     916         173 :       return isLegalGlobalAddressingMode(AM);
     917             : 
     918        3142 :     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
     919             :       // SMRD instructions have an 8-bit, dword offset on SI.
     920         923 :       if (!isUInt<8>(AM.BaseOffs / 4))
     921             :         return false;
     922        2219 :     } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
     923             :       // On CI+, this can also be a 32-bit literal constant offset. If it fits
      924             :       // in 8 bits, it can use a smaller encoding.
     925         879 :       if (!isUInt<32>(AM.BaseOffs / 4))
     926             :         return false;
     927        1340 :     } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      928             :       // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
     929        1340 :       if (!isUInt<20>(AM.BaseOffs))
     930             :         return false;
     931             :     } else
     932           0 :       llvm_unreachable("unhandled generation");
     933             : 
     934        3030 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     935             :       return true;
     936             : 
     937         337 :     if (AM.Scale == 1 && AM.HasBaseReg)
     938             :       return true;
     939             : 
     940         337 :     return false;
     941             : 
     942       27612 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     943        5084 :     return isLegalMUBUFAddressingMode(AM);
     944       29116 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
     945        6588 :              AS == AMDGPUASI.REGION_ADDRESS) {
     946             :     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     947             :     // field.
     948             :     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
     949             :     // an 8-bit dword offset but we don't know the alignment here.
     950       15942 :     if (!isUInt<16>(AM.BaseOffs))
     951             :       return false;
     952             : 
     953       14551 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     954             :       return true;
     955             : 
     956        3584 :     if (AM.Scale == 1 && AM.HasBaseReg)
     957             :       return true;
     958             : 
     959        2245 :     return false;
     960        6586 :   } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
     961             :              AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
     962             :     // For an unknown address space, this usually means that this is for some
     963             :     // reason being used for pure arithmetic, and not based on some addressing
     964             :     // computation. We don't have instructions that compute pointers with any
     965             :     // addressing modes, so treat them as having no offset like flat
     966             :     // instructions.
     967        6586 :     return isLegalFlatAddressingMode(AM);
     968             :   } else {
     969           0 :     llvm_unreachable("unhandled address space");
     970             :   }
     971             : }
     972             : 
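                      : // Limit how wide the store merger may combine accesses per address space:
                      : // up to 128 bits for global / flat, up to 8 * MaxPrivateElementSize bits
                      : // for private (scratch), and up to 64 bits for local (LDS) memory.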
     973       12606 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
     974             :                                         const SelectionDAG &DAG) const {
     975       12606 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
     976        7079 :     return (MemVT.getSizeInBits() <= 4 * 32);
     977        5527 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     978        1148 :     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     979        1148 :     return (MemVT.getSizeInBits() <= MaxPrivateBits);
     980        4379 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     981        4379 :     return (MemVT.getSizeInBits() <= 2 * 32);
     982             :   }
     983             :   return true;
     984             : }
     985             : 
     986       25071 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     987             :                                                       unsigned AddrSpace,
     988             :                                                       unsigned Align,
     989             :                                                       bool *IsFast) const {
     990       25071 :   if (IsFast)
     991       18858 :     *IsFast = false;
     992             : 
     993             :   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
     994             :   // which isn't a simple VT.
     995             :   // Until MVT is extended to handle this, simply check for the size and
     996             :   // rely on the condition below: allow accesses if the size is a multiple of 4.
     997       25071 :   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
     998             :                            VT.getStoreSize() > 16)) {
     999             :     return false;
    1000             :   }
    1001             : 
    1002       42202 :   if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
    1003       17131 :       AddrSpace == AMDGPUASI.REGION_ADDRESS) {
     1004             :     // ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
     1005             :     // aligned, 8-byte access in a single operation using ds_read2/write2_b32
    1006             :     // with adjacent offsets.
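                      :     // For example, an 8-byte LDS access with only 4-byte alignment is still
                      :     // reported as fast here because it can be emitted as two adjacent 32-bit
                      :     // accesses.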
    1007        7940 :     bool AlignedBy4 = (Align % 4 == 0);
    1008        7940 :     if (IsFast)
    1009        5507 :       *IsFast = AlignedBy4;
    1010             : 
    1011             :     return AlignedBy4;
    1012             :   }
    1013             : 
    1014             :   // FIXME: We have to be conservative here and assume that flat operations
    1015             :   // will access scratch.  If we had access to the IR function, then we
    1016             :   // could determine if any private memory was used in the function.
    1017       34230 :   if (!Subtarget->hasUnalignedScratchAccess() &&
    1018       33892 :       (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
    1019       16793 :        AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
    1020             :     return false;
    1021             :   }
    1022             : 
    1023       16801 :   if (Subtarget->hasUnalignedBufferAccess()) {
     1024             :     // If we have a uniform constant load, it still requires using a slow
    1025             :     // buffer instruction if unaligned.
    1026        5290 :     if (IsFast) {
    1027       11439 :       *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
    1028        7813 :                  AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
    1029         187 :         (Align % 4 == 0) : true;
    1030             :     }
    1031             : 
    1032             :     return true;
    1033             :   }
    1034             : 
    1035             :   // Smaller than dword value must be aligned.
    1036       11511 :   if (VT.bitsLT(MVT::i32))
    1037             :     return false;
    1038             : 
    1039             :   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
    1040             :   // byte-address are ignored, thus forcing Dword alignment.
    1041             :   // This applies to private, global, and constant memory.
    1042        9724 :   if (IsFast)
    1043        7673 :     *IsFast = true;
    1044             : 
    1045        9724 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
    1046             : }
    1047             : 
    1048         112 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
    1049             :                                           unsigned SrcAlign, bool IsMemset,
    1050             :                                           bool ZeroMemset,
    1051             :                                           bool MemcpyStrSrc,
    1052             :                                           MachineFunction &MF) const {
    1053             :   // FIXME: Should account for address space here.
    1054             : 
    1055             :   // The default fallback uses the private pointer size as a guess for a type to
    1056             :   // use. Make sure we switch these to 64-bit accesses.
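                      :   // For example, a 64-byte copy with a 4-byte aligned destination is widened
                      :   // to v4i32 (16-byte) operations by the check below.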
    1057             : 
    1058         112 :   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    1059          86 :     return MVT::v4i32;
    1060             : 
    1061          26 :   if (Size >= 8 && DstAlign >= 4)
    1062           8 :     return MVT::v2i32;
    1063             : 
    1064             :   // Use the default.
    1065          18 :   return MVT::Other;
    1066             : }
    1067             : 
    1068             : static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
    1069         482 :   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
    1070             :          AS == AMDGPUASI.FLAT_ADDRESS ||
    1071         872 :          AS == AMDGPUASI.CONSTANT_ADDRESS ||
    1072         196 :          AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
    1073             : }
    1074             : 
    1075         219 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
    1076             :                                            unsigned DestAS) const {
    1077             :   return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
    1078         219 :          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
    1079             : }
    1080             : 
    1081        3156 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
    1082             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
    1083             :   const Value *Ptr = MemNode->getMemOperand()->getValue();
    1084             :   const Instruction *I = dyn_cast<Instruction>(Ptr);
    1085        2377 :   return I && I->getMetadata("amdgpu.noclobber");
    1086             : }
    1087             : 
    1088          77 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
    1089             :                                             unsigned DestAS) const {
    1090             :   // Flat -> private/local is a simple truncate.
    1091             :   // Flat -> global is no-op
    1092          77 :   if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
    1093             :     return true;
    1094             : 
    1095          21 :   return isNoopAddrSpaceCast(SrcAS, DestAS);
    1096             : }
    1097             : 
    1098      121931 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
    1099             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
    1100             : 
    1101      121931 :   return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
    1102             : }
    1103             : 
    1104             : TargetLoweringBase::LegalizeTypeAction
    1105      173510 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
    1106      316330 :   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    1107             :     return TypeSplitVector;
    1108             : 
    1109             :   return TargetLoweringBase::getPreferredVectorAction(VT);
    1110             : }
    1111             : 
    1112          32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
    1113             :                                                          Type *Ty) const {
    1114             :   // FIXME: Could be smarter if called for vector constants.
    1115          32 :   return true;
    1116             : }
    1117             : 
    1118      256442 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
    1119      256442 :   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    1120       15266 :     switch (Op) {
    1121             :     case ISD::LOAD:
    1122             :     case ISD::STORE:
    1123             : 
    1124             :     // These operations are done with 32-bit instructions anyway.
    1125             :     case ISD::AND:
    1126             :     case ISD::OR:
    1127             :     case ISD::XOR:
    1128             :     case ISD::SELECT:
    1129             :       // TODO: Extensions?
    1130             :       return true;
    1131       10380 :     default:
    1132       10380 :       return false;
    1133             :     }
    1134             :   }
    1135             : 
    1136             :   // SimplifySetCC uses this function to determine whether or not it should
    1137             :   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
    1138         472 :   if (VT == MVT::i1 && Op == ISD::SETCC)
    1139             :     return false;
    1140             : 
    1141             :   return TargetLowering::isTypeDesirableForOp(Op, VT);
    1142             : }
    1143             : 
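                      : // Build a pointer to a kernel argument: copy the preloaded
                      : // KERNARG_SEGMENT_PTR SGPRs out of their live-in virtual register and add
                      : // the byte Offset, producing a constant-address-space pointer.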
    1144       34493 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
    1145             :                                                    const SDLoc &SL,
    1146             :                                                    SDValue Chain,
    1147             :                                                    uint64_t Offset) const {
    1148             :   const DataLayout &DL = DAG.getDataLayout();
    1149             :   MachineFunction &MF = DAG.getMachineFunction();
    1150       34493 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1151             : 
    1152             :   const ArgDescriptor *InputPtrReg;
    1153             :   const TargetRegisterClass *RC;
    1154             : 
    1155             :   std::tie(InputPtrReg, RC)
    1156             :     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    1157             : 
    1158             :   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    1159             :   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
    1160             :   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    1161       34493 :     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
    1162             : 
    1163             :   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
    1164       68986 :                      DAG.getConstant(Offset, SL, PtrVT));
    1165             : }
    1166             : 
    1167          34 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
    1168             :                                             const SDLoc &SL) const {
    1169          34 :   auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    1170          34 :   uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
    1171          34 :   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
    1172             : }
    1173             : 
    1174       34459 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
    1175             :                                          const SDLoc &SL, SDValue Val,
    1176             :                                          bool Signed,
    1177             :                                          const ISD::InputArg *Arg) const {
    1178      103238 :   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
    1179          83 :       VT.bitsLT(MemVT)) {
    1180          46 :     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    1181          46 :     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
    1182             :   }
    1183             : 
    1184       34459 :   if (MemVT.isFloatingPoint())
    1185        2571 :     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
    1186       31888 :   else if (Signed)
    1187          16 :     Val = DAG.getSExtOrTrunc(Val, SL, VT);
    1188             :   else
    1189       31872 :     Val = DAG.getZExtOrTrunc(Val, SL, VT);
    1190             : 
    1191       34459 :   return Val;
    1192             : }
    1193             : 
    1194       34459 : SDValue SITargetLowering::lowerKernargMemParameter(
    1195             :   SelectionDAG &DAG, EVT VT, EVT MemVT,
    1196             :   const SDLoc &SL, SDValue Chain,
    1197             :   uint64_t Offset, bool Signed,
    1198             :   const ISD::InputArg *Arg) const {
    1199             :   const DataLayout &DL = DAG.getDataLayout();
    1200       34459 :   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    1201       34459 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    1202       34459 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    1203             : 
    1204       34459 :   unsigned Align = DL.getABITypeAlignment(Ty);
    1205             : 
    1206       34459 :   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
    1207             :   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
    1208             :                              MachineMemOperand::MODereferenceable |
    1209       34459 :                              MachineMemOperand::MOInvariant);
    1210             : 
    1211       34459 :   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
    1212       68918 :   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
    1213             : }
    1214             : 
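                      : // Lower a formal argument assigned to a stack location: byval arguments
                      : // become a fixed frame index; everything else is loaded from a fixed stack
                      : // object with the extension kind implied by the CCValAssign.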
    1215         211 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
    1216             :                                               const SDLoc &SL, SDValue Chain,
    1217             :                                               const ISD::InputArg &Arg) const {
    1218             :   MachineFunction &MF = DAG.getMachineFunction();
    1219             :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1220             : 
    1221         211 :   if (Arg.Flags.isByVal()) {
    1222             :     unsigned Size = Arg.Flags.getByValSize();
    1223          60 :     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    1224          60 :     return DAG.getFrameIndex(FrameIdx, MVT::i32);
    1225             :   }
    1226             : 
    1227             :   unsigned ArgOffset = VA.getLocMemOffset();
    1228         302 :   unsigned ArgSize = VA.getValVT().getStoreSize();
    1229             : 
    1230         151 :   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
    1231             : 
    1232             :   // Create load nodes to retrieve arguments from the stack.
    1233         151 :   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
    1234             :   SDValue ArgValue;
    1235             : 
     1236             :   // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
    1237             :   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
    1238             :   MVT MemVT = VA.getValVT();
    1239             : 
    1240         151 :   switch (VA.getLocInfo()) {
    1241             :   default:
    1242             :     break;
    1243           0 :   case CCValAssign::BCvt:
    1244             :     MemVT = VA.getLocVT();
    1245           0 :     break;
    1246           0 :   case CCValAssign::SExt:
    1247             :     ExtType = ISD::SEXTLOAD;
    1248           0 :     break;
    1249           0 :   case CCValAssign::ZExt:
    1250             :     ExtType = ISD::ZEXTLOAD;
    1251           0 :     break;
    1252           3 :   case CCValAssign::AExt:
    1253             :     ExtType = ISD::EXTLOAD;
    1254           3 :     break;
    1255             :   }
    1256             : 
    1257         151 :   ArgValue = DAG.getExtLoad(
    1258             :     ExtType, SL, VA.getLocVT(), Chain, FIN,
    1259             :     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
    1260         302 :     MemVT);
    1261         151 :   return ArgValue;
    1262             : }
    1263             : 
    1264         195 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
    1265             :   const SIMachineFunctionInfo &MFI,
    1266             :   EVT VT,
    1267             :   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
    1268             :   const ArgDescriptor *Reg;
    1269             :   const TargetRegisterClass *RC;
    1270             : 
    1271             :   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
    1272         195 :   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
    1273             : }
    1274             : 
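                      : // Preprocess shader inputs: unused PS inputs are recorded in Skipped,
                      : // used ones are marked in the PS input masks, and vector arguments are
                      : // split into one InputArg per original source element.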
    1275         799 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
    1276             :                                    CallingConv::ID CallConv,
    1277             :                                    ArrayRef<ISD::InputArg> Ins,
    1278             :                                    BitVector &Skipped,
    1279             :                                    FunctionType *FType,
    1280             :                                    SIMachineFunctionInfo *Info) {
    1281        3897 :   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    1282        3098 :     const ISD::InputArg &Arg = Ins[I];
    1283             : 
    1284             :     // First check if it's a PS input addr.
    1285        4401 :     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
    1286        4857 :         !Arg.Flags.isByVal() && PSInputNum <= 15) {
    1287             : 
    1288        3840 :       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
    1289             :         // We can safely skip PS inputs.
    1290             :         Skipped.set(I);
    1291         983 :         ++PSInputNum;
    1292         983 :         continue;
    1293             :       }
    1294             : 
    1295             :       Info->markPSInputAllocated(PSInputNum);
    1296         776 :       if (Arg.Used)
    1297             :         Info->markPSInputEnabled(PSInputNum);
    1298             : 
    1299         776 :       ++PSInputNum;
    1300             :     }
    1301             : 
     1302             :     // Second, split vertices into their elements.
    1303        2115 :     if (Arg.VT.isVector()) {
    1304         595 :       ISD::InputArg NewArg = Arg;
    1305             :       NewArg.Flags.setSplit();
    1306         595 :       NewArg.VT = Arg.VT.getVectorElementType();
    1307             : 
    1308             :       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
    1309             :       // three or five element vertex only needs three or five registers,
    1310             :       // NOT four or eight.
    1311             :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1312             :       unsigned NumElements = ParamType->getVectorNumElements();
    1313             : 
    1314        5387 :       for (unsigned J = 0; J != NumElements; ++J) {
    1315        2396 :         Splits.push_back(NewArg);
    1316        2396 :         NewArg.PartOffset += NewArg.VT.getStoreSize();
    1317             :       }
    1318             :     } else {
    1319        1520 :       Splits.push_back(Arg);
    1320             :     }
    1321             :   }
    1322         799 : }
    1323             : 
    1324             : // Allocate special inputs passed in VGPRs.
    1325       15458 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
    1326             :                                            MachineFunction &MF,
    1327             :                                            const SIRegisterInfo &TRI,
    1328             :                                            SIMachineFunctionInfo &Info) {
    1329       15458 :   if (Info.hasWorkItemIDX()) {
    1330             :     unsigned Reg = AMDGPU::VGPR0;
    1331       14659 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1332             : 
    1333       14659 :     CCInfo.AllocateReg(Reg);
    1334             :     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
    1335             :   }
    1336             : 
    1337       15458 :   if (Info.hasWorkItemIDY()) {
    1338             :     unsigned Reg = AMDGPU::VGPR1;
    1339         130 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1340             : 
    1341         130 :     CCInfo.AllocateReg(Reg);
    1342             :     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    1343             :   }
    1344             : 
    1345       15458 :   if (Info.hasWorkItemIDZ()) {
    1346             :     unsigned Reg = AMDGPU::VGPR2;
    1347          76 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1348             : 
    1349          76 :     CCInfo.AllocateReg(Reg);
    1350             :     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    1351             :   }
    1352       15458 : }
    1353             : 
     1354             : // Try to allocate a VGPR at the end of the argument list, or, if no argument
     1355             : // VGPRs are left, allocate a stack slot instead.
    1356          31 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
    1357             :   ArrayRef<MCPhysReg> ArgVGPRs
    1358             :     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
    1359             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
    1360          31 :   if (RegIdx == ArgVGPRs.size()) {
    1361             :     // Spill to stack required.
    1362           8 :     int64_t Offset = CCInfo.AllocateStack(4, 4);
    1363             : 
    1364             :     return ArgDescriptor::createStack(Offset);
    1365             :   }
    1366             : 
    1367          23 :   unsigned Reg = ArgVGPRs[RegIdx];
    1368          23 :   Reg = CCInfo.AllocateReg(Reg);
    1369             :   assert(Reg != AMDGPU::NoRegister);
    1370             : 
    1371             :   MachineFunction &MF = CCInfo.getMachineFunction();
    1372          23 :   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1373             :   return ArgDescriptor::createRegister(Reg);
    1374             : }
    1375             : 
    1376         115 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
    1377             :                                              const TargetRegisterClass *RC,
    1378             :                                              unsigned NumArgRegs) {
     1379             :   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), NumArgRegs);
    1380             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
    1381         115 :   if (RegIdx == ArgSGPRs.size())
    1382           0 :     report_fatal_error("ran out of SGPRs for arguments");
    1383             : 
    1384         115 :   unsigned Reg = ArgSGPRs[RegIdx];
    1385         115 :   Reg = CCInfo.AllocateReg(Reg);
    1386             :   assert(Reg != AMDGPU::NoRegister);
    1387             : 
    1388             :   MachineFunction &MF = CCInfo.getMachineFunction();
    1389         115 :   MF.addLiveIn(Reg, RC);
    1390         115 :   return ArgDescriptor::createRegister(Reg);
    1391             : }
    1392             : 
    1393             : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
    1394          62 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
    1395             : }
    1396             : 
    1397             : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
    1398          53 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
    1399             : }
    1400             : 
    1401        1167 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
    1402             :                                       MachineFunction &MF,
    1403             :                                       const SIRegisterInfo &TRI,
    1404             :                                       SIMachineFunctionInfo &Info) {
    1405        1167 :   if (Info.hasWorkItemIDX())
    1406          15 :     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
    1407             : 
    1408        1167 :   if (Info.hasWorkItemIDY())
    1409           8 :     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
    1410             : 
    1411        1167 :   if (Info.hasWorkItemIDZ())
    1412           8 :     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
    1413        1167 : }
    1414             : 
    1415        1167 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
    1416             :                                       MachineFunction &MF,
    1417             :                                       const SIRegisterInfo &TRI,
    1418             :                                       SIMachineFunctionInfo &Info) {
    1419             :   auto &ArgInfo = Info.getArgInfo();
    1420             : 
    1421             :   // TODO: Unify handling with private memory pointers.
    1422             : 
    1423        1167 :   if (Info.hasDispatchPtr())
    1424          10 :     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
    1425             : 
    1426        1167 :   if (Info.hasQueuePtr())
    1427          11 :     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
    1428             : 
    1429        1167 :   if (Info.hasKernargSegmentPtr())
    1430          13 :     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
    1431             : 
    1432        1167 :   if (Info.hasDispatchID())
    1433          10 :     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
    1434             : 
    1435             :   // flat_scratch_init is not applicable for non-kernel functions.
    1436             : 
    1437        1167 :   if (Info.hasWorkGroupIDX())
    1438          22 :     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
    1439             : 
    1440        1167 :   if (Info.hasWorkGroupIDY())
    1441          20 :     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
    1442             : 
    1443        1167 :   if (Info.hasWorkGroupIDZ())
    1444          20 :     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
    1445             : 
    1446        1167 :   if (Info.hasImplicitArgPtr())
    1447           9 :     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
    1448        1167 : }
    1449             : 
    1450             : // Allocate special inputs passed in user SGPRs.
    1451       15458 : static void allocateHSAUserSGPRs(CCState &CCInfo,
    1452             :                                  MachineFunction &MF,
    1453             :                                  const SIRegisterInfo &TRI,
    1454             :                                  SIMachineFunctionInfo &Info) {
    1455       15458 :   if (Info.hasImplicitBufferPtr()) {
    1456           2 :     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    1457           2 :     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    1458           2 :     CCInfo.AllocateReg(ImplicitBufferPtrReg);
    1459             :   }
    1460             : 
    1461             :   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
    1462       15458 :   if (Info.hasPrivateSegmentBuffer()) {
    1463        1868 :     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    1464        1868 :     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    1465        1868 :     CCInfo.AllocateReg(PrivateSegmentBufferReg);
    1466             :   }
    1467             : 
    1468       15458 :   if (Info.hasDispatchPtr()) {
    1469          42 :     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    1470          42 :     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    1471          42 :     CCInfo.AllocateReg(DispatchPtrReg);
    1472             :   }
    1473             : 
    1474       15458 :   if (Info.hasQueuePtr()) {
    1475          57 :     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    1476          57 :     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    1477          57 :     CCInfo.AllocateReg(QueuePtrReg);
    1478             :   }
    1479             : 
    1480       15458 :   if (Info.hasKernargSegmentPtr()) {
    1481       13731 :     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
    1482       13731 :     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    1483       13731 :     CCInfo.AllocateReg(InputPtrReg);
    1484             :   }
    1485             : 
    1486       15458 :   if (Info.hasDispatchID()) {
    1487           5 :     unsigned DispatchIDReg = Info.addDispatchID(TRI);
    1488           5 :     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    1489           5 :     CCInfo.AllocateReg(DispatchIDReg);
    1490             :   }
    1491             : 
    1492       15458 :   if (Info.hasFlatScratchInit()) {
    1493         349 :     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    1494         349 :     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    1495         349 :     CCInfo.AllocateReg(FlatScratchInitReg);
    1496             :   }
    1497             : 
    1498             :   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
    1499             :   // these from the dispatch pointer.
    1500       15458 : }
    1501             : 
    1502             : // Allocate special input registers that are initialized per-wave.
    1503       15458 : static void allocateSystemSGPRs(CCState &CCInfo,
    1504             :                                 MachineFunction &MF,
    1505             :                                 SIMachineFunctionInfo &Info,
    1506             :                                 CallingConv::ID CallConv,
    1507             :                                 bool IsShader) {
    1508       15458 :   if (Info.hasWorkGroupIDX()) {
    1509             :     unsigned Reg = Info.addWorkGroupIDX();
    1510       14659 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1511       14659 :     CCInfo.AllocateReg(Reg);
    1512             :   }
    1513             : 
    1514       15458 :   if (Info.hasWorkGroupIDY()) {
    1515             :     unsigned Reg = Info.addWorkGroupIDY();
    1516          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1517          24 :     CCInfo.AllocateReg(Reg);
    1518             :   }
    1519             : 
    1520       15458 :   if (Info.hasWorkGroupIDZ()) {
    1521             :     unsigned Reg = Info.addWorkGroupIDZ();
    1522          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1523          24 :     CCInfo.AllocateReg(Reg);
    1524             :   }
    1525             : 
    1526       15458 :   if (Info.hasWorkGroupInfo()) {
    1527             :     unsigned Reg = Info.addWorkGroupInfo();
    1528           0 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1529           0 :     CCInfo.AllocateReg(Reg);
    1530             :   }
    1531             : 
    1532       15458 :   if (Info.hasPrivateSegmentWaveByteOffset()) {
    1533             :     // Scratch wave offset passed in system SGPR.
    1534             :     unsigned PrivateSegmentWaveByteOffsetReg;
    1535             : 
    1536       14707 :     if (IsShader) {
    1537             :       PrivateSegmentWaveByteOffsetReg =
    1538             :         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
    1539             : 
    1540             :       // This is true if the scratch wave byte offset doesn't have a fixed
    1541             :       // location.
    1542          48 :       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
    1543             :         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
    1544             :         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    1545             :       }
    1546             :     } else
    1547             :       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    1548             : 
    1549       14707 :     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    1550       14707 :     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
    1551             :   }
    1552       15458 : }
    1553             : 
    1554       15474 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
    1555             :                                      MachineFunction &MF,
    1556             :                                      const SIRegisterInfo &TRI,
    1557             :                                      SIMachineFunctionInfo &Info) {
    1558             :   // Now that we've figured out where the scratch register inputs are, see if
     1559             :   // we should reserve the arguments and use them directly.
    1560             :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1561             :   bool HasStackObjects = MFI.hasStackObjects();
    1562             : 
    1563             :   // Record that we know we have non-spill stack objects so we don't need to
    1564             :   // check all stack objects later.
    1565       15474 :   if (HasStackObjects)
    1566             :     Info.setHasNonSpillStackObjects(true);
    1567             : 
    1568             :   // Everything live out of a block is spilled with fast regalloc, so it's
    1569             :   // almost certain that spilling will be required.
    1570       15474 :   if (TM.getOptLevel() == CodeGenOpt::None)
    1571             :     HasStackObjects = true;
    1572             : 
    1573             :   // For now assume stack access is needed in any callee functions, so we need
    1574             :   // the scratch registers to pass in.
    1575       15294 :   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
    1576             : 
    1577             :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1578       15474 :   if (ST.isAmdCodeObjectV2(MF)) {
    1579        1871 :     if (RequiresStackAccess) {
    1580             :       // If we have stack objects, we unquestionably need the private buffer
    1581             :       // resource. For the Code Object V2 ABI, this will be the first 4 user
    1582             :       // SGPR inputs. We can reserve those and use them directly.
    1583             : 
    1584             :       unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
    1585             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    1586             :       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    1587             : 
    1588         446 :       if (MFI.hasCalls()) {
    1589             :         // If we have calls, we need to keep the frame register in a register
    1590             :         // that won't be clobbered by a call, so ensure it is copied somewhere.
    1591             : 
    1592             :         // This is not a problem for the scratch wave offset, because the same
    1593             :         // registers are reserved in all functions.
    1594             : 
    1595             :         // FIXME: Nothing is really ensuring this is a call preserved register,
    1596             :         // it's just selected from the end so it happens to be.
    1597             :         unsigned ReservedOffsetReg
    1598         233 :           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1599             :         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1600             :       } else {
    1601             :         unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
    1602             :           AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1603             :         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    1604             :       }
    1605             :     } else {
    1606             :       unsigned ReservedBufferReg
    1607        1425 :         = TRI.reservedPrivateSegmentBufferReg(MF);
    1608             :       unsigned ReservedOffsetReg
    1609        1425 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1610             : 
    1611             :       // We tentatively reserve the last registers (skipping the last two
    1612             :       // which may contain VCC). After register allocation, we'll replace
    1613             :       // these with the ones immediately after those which were really
     1614             :       // allocated. In the prologue, copies will be inserted from the argument
    1615             :       // to these reserved registers.
    1616             :       Info.setScratchRSrcReg(ReservedBufferReg);
    1617             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1618             :     }
    1619             :   } else {
    1620       13603 :     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    1621             : 
    1622             :     // Without HSA, relocations are used for the scratch pointer and the
    1623             :     // buffer resource setup is always inserted in the prologue. Scratch wave
    1624             :     // offset is still in an input SGPR.
    1625             :     Info.setScratchRSrcReg(ReservedBufferReg);
    1626             : 
    1627       13603 :     if (HasStackObjects && !MFI.hasCalls()) {
    1628             :       unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
    1629             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1630             :       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    1631             :     } else {
    1632             :       unsigned ReservedOffsetReg
    1633       13320 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1634             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1635             :     }
    1636             :   }
    1637       15474 : }
    1638             : 
    1639       16448 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
    1640       16448 :   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1641       16448 :   return !Info->isEntryFunction();
    1642             : }
    1643             : 
    1644        1167 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
    1645             : 
    1646        1167 : }
    1647             : 
    1648        1167 : void SITargetLowering::insertCopiesSplitCSR(
    1649             :   MachineBasicBlock *Entry,
    1650             :   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
    1651        1167 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1652             : 
    1653        1167 :   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    1654        1167 :   if (!IStart)
    1655        1167 :     return;
    1656             : 
    1657           0 :   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    1658             :   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
    1659           0 :   MachineBasicBlock::iterator MBBI = Entry->begin();
    1660           0 :   for (const MCPhysReg *I = IStart; *I; ++I) {
    1661             :     const TargetRegisterClass *RC = nullptr;
    1662           0 :     if (AMDGPU::SReg_64RegClass.contains(*I))
    1663             :       RC = &AMDGPU::SGPR_64RegClass;
    1664           0 :     else if (AMDGPU::SReg_32RegClass.contains(*I))
    1665             :       RC = &AMDGPU::SGPR_32RegClass;
    1666             :     else
    1667           0 :       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    1668             : 
    1669           0 :     unsigned NewVR = MRI->createVirtualRegister(RC);
    1670             :     // Create copy from CSR to a virtual register.
    1671           0 :     Entry->addLiveIn(*I);
    1672           0 :     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
    1673           0 :       .addReg(*I);
    1674             : 
    1675             :     // Insert the copy-back instructions right before the terminator.
    1676           0 :     for (auto *Exit : Exits)
    1677           0 :       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
    1678           0 :               TII->get(TargetOpcode::COPY), *I)
    1679           0 :         .addReg(NewVR);
    1680             :   }
    1681             : }
    1682             : 
    1683       16628 : SDValue SITargetLowering::LowerFormalArguments(
    1684             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1685             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1686             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1687       16628 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1688             : 
    1689             :   MachineFunction &MF = DAG.getMachineFunction();
    1690             :   FunctionType *FType = MF.getFunction().getFunctionType();
    1691       16628 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1692             :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1693             : 
    1694       16628 :   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    1695             :     const Function &Fn = MF.getFunction();
    1696             :     DiagnosticInfoUnsupported NoGraphicsHSA(
    1697           6 :         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    1698           3 :     DAG.getContext()->diagnose(NoGraphicsHSA);
    1699             :     return DAG.getEntryNode();
    1700             :   }
    1701             : 
    1702             :   // Create stack objects that are used for emitting debugger prologue if
    1703             :   // "amdgpu-debugger-emit-prologue" attribute was specified.
    1704       16625 :   if (ST.debuggerEmitPrologue())
    1705           4 :     createDebuggerPrologueStackObjects(MF);
    1706             : 
    1707             :   SmallVector<ISD::InputArg, 16> Splits;
    1708             :   SmallVector<CCValAssign, 16> ArgLocs;
    1709       16625 :   BitVector Skipped(Ins.size());
    1710             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1711       33250 :                  *DAG.getContext());
    1712             : 
    1713       16625 :   bool IsShader = AMDGPU::isShader(CallConv);
    1714             :   bool IsKernel = AMDGPU::isKernel(CallConv);
    1715       16625 :   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
    1716             : 
    1717       16625 :   if (!IsEntryFunc) {
    1718             :     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
    1719             :     // this when allocating argument fixed offsets.
    1720        1167 :     CCInfo.AllocateStack(4, 4);
    1721             :   }
    1722             : 
    1723       16625 :   if (IsShader) {
    1724         799 :     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
    1725             : 
    1726             :     // At least one interpolation mode must be enabled or else the GPU will
    1727             :     // hang.
    1728             :     //
    1729             :     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    1730             :     // set PSInputAddr, the user wants to enable some bits after the compilation
    1731             :     // based on run-time states. Since we can't know what the final PSInputEna
     1732             :     // will look like, we shouldn't do anything here and the user should take
    1733             :     // responsibility for the correct programming.
    1734             :     //
    1735             :     // Otherwise, the following restrictions apply:
    1736             :     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    1737             :     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    1738             :     //   enabled too.
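                      :     // For example, a pixel shader whose only input is POS_W_FLOAT (bit 11)
                      :     // would otherwise enable no PERSP_* input, so input 0 (the first PERSP_*
                      :     // input) is force-enabled below.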
    1739         799 :     if (CallConv == CallingConv::AMDGPU_PS) {
    1740         946 :       if ((Info->getPSInputAddr() & 0x7F) == 0 ||
    1741         389 :            ((Info->getPSInputAddr() & 0xF) == 0 &&
    1742             :             Info->isPSInputAllocated(11))) {
    1743             :         CCInfo.AllocateReg(AMDGPU::VGPR0);
    1744             :         CCInfo.AllocateReg(AMDGPU::VGPR1);
    1745             :         Info->markPSInputAllocated(0);
    1746             :         Info->markPSInputEnabled(0);
    1747             :       }
    1748         560 :       if (Subtarget->isAmdPalOS()) {
    1749             :         // For isAmdPalOS, the user does not enable some bits after compilation
    1750             :         // based on run-time states; the register values being generated here are
    1751             :         // the final ones set in hardware. Therefore we need to apply the
    1752             :         // workaround to PSInputAddr and PSInputEnable together.  (The case where
    1753             :         // a bit is set in PSInputAddr but not PSInputEnable is where the
    1754             :         // frontend set up an input arg for a particular interpolation mode, but
    1755             :         // nothing uses that input arg. Really we should have an earlier pass
    1756             :         // that removes such an arg.)
    1757           6 :         unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    1758           9 :         if ((PsInputBits & 0x7F) == 0 ||
    1759           3 :             ((PsInputBits & 0xF) == 0 &&
    1760             :              (PsInputBits >> 11 & 1)))
    1761           3 :           Info->markPSInputEnabled(
    1762             :               countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    1763             :       }
    1764             :     }
    1765             : 
    1766             :     assert(!Info->hasDispatchPtr() &&
    1767             :            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
    1768             :            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
    1769             :            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
    1770             :            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
    1771             :            !Info->hasWorkItemIDZ());
    1772       15826 :   } else if (IsKernel) {
    1773             :     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    1774             :   } else {
    1775        1167 :     Splits.append(Ins.begin(), Ins.end());
    1776             :   }
    1777             : 
    1778       16625 :   if (IsEntryFunc) {
    1779       15458 :     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    1780       15458 :     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
    1781             :   }
    1782             : 
    1783       16625 :   if (IsKernel) {
    1784       14659 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1785             :   } else {
    1786        1966 :     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    1787        1966 :     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
    1788             :   }
    1789             : 
    1790             :   SmallVector<SDValue, 16> Chains;
    1791             : 
    1792       57336 :   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    1793       40711 :     const ISD::InputArg &Arg = Ins[i];
    1794       41694 :     if (Skipped[i]) {
    1795        1966 :       InVals.push_back(DAG.getUNDEF(Arg.VT));
    1796       37128 :       continue;
    1797             :     }
    1798             : 
    1799       39728 :     CCValAssign &VA = ArgLocs[ArgIdx++];
    1800             :     MVT VT = VA.getLocVT();
    1801             : 
    1802       76199 :     if (IsEntryFunc && VA.isMemLoc()) {
    1803       34356 :       VT = Ins[i].VT;
    1804             :       EVT MemVT = VA.getLocVT();
    1805             : 
    1806       68712 :       const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
    1807       34356 :         VA.getLocMemOffset();
    1808       34356 :       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
    1809             : 
     1810             :       // The first 36 bytes of the input buffer contain information about
    1811             :       // thread group and global sizes.
    1812             :       SDValue Arg = lowerKernargMemParameter(
    1813       68712 :         DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
    1814       34356 :       Chains.push_back(Arg.getValue(1));
    1815             : 
    1816             :       auto *ParamTy =
    1817             :         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    1818       47630 :       if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    1819       42939 :           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    1820             :         // On SI local pointers are just offsets into LDS, so they are always
    1821             :         // less than 16-bits.  On CI and newer they could potentially be
    1822             :         // real pointers, so we can't guarantee their size.
    1823         572 :         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
    1824        1144 :                           DAG.getValueType(MVT::i16));
    1825             :       }
    1826             : 
    1827       34356 :       InVals.push_back(Arg);
    1828       34356 :       continue;
    1829        8629 :     } else if (!IsEntryFunc && VA.isMemLoc()) {
    1830         211 :       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
    1831         211 :       InVals.push_back(Val);
    1832         211 :       if (!Arg.Flags.isByVal())
    1833         151 :         Chains.push_back(Val.getValue(1));
    1834         211 :       continue;
    1835             :     }
    1836             : 
    1837             :     assert(VA.isRegLoc() && "Parameter must be in a register!");
    1838             : 
    1839             :     unsigned Reg = VA.getLocReg();
    1840        5161 :     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    1841             :     EVT ValVT = VA.getValVT();
    1842             : 
    1843        5161 :     Reg = MF.addLiveIn(Reg, RC);
    1844        5161 :     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1845             : 
    1846        5173 :     if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
    1847             :       // The return object should be reasonably addressable.
    1848             : 
     1849             :       // FIXME: This helps when the return is a real sret. If it is an
    1850             :       // automatically inserted sret (i.e. CanLowerReturn returns false), an
    1851             :       // extra copy is inserted in SelectionDAGBuilder which obscures this.
    1852          12 :       unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
    1853          12 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1854          24 :         DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    1855             :     }
    1856             : 
    1857             :     // If this is an 8 or 16-bit value, it is really passed promoted
    1858             :     // to 32 bits. Insert an assert[sz]ext to capture this, then
    1859             :     // truncate to the right size.
    1860        5161 :     switch (VA.getLocInfo()) {
    1861             :     case CCValAssign::Full:
    1862             :       break;
    1863             :     case CCValAssign::BCvt:
    1864           0 :       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
    1865           0 :       break;
    1866             :     case CCValAssign::SExt:
    1867           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
    1868          14 :                         DAG.getValueType(ValVT));
    1869           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1870           7 :       break;
    1871             :     case CCValAssign::ZExt:
    1872          10 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1873          20 :                         DAG.getValueType(ValVT));
    1874          10 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1875          10 :       break;
    1876             :     case CCValAssign::AExt:
    1877           6 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1878           6 :       break;
    1879           0 :     default:
    1880           0 :       llvm_unreachable("Unknown loc info!");
    1881             :     }
    1882             : 
    1883        7276 :     if (IsShader && Arg.VT.isVector()) {
    1884             :       // Build a vector from the registers
    1885             :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1886             :       unsigned NumElements = ParamType->getVectorNumElements();
    1887             : 
    1888             :       SmallVector<SDValue, 4> Regs;
    1889         595 :       Regs.push_back(Val);
    1890        4197 :       for (unsigned j = 1; j != NumElements; ++j) {
    1891        1801 :         Reg = ArgLocs[ArgIdx++].getLocReg();
    1892        1801 :         Reg = MF.addLiveIn(Reg, RC);
    1893             : 
    1894        1801 :         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1895        1801 :         Regs.push_back(Copy);
    1896             :       }
    1897             : 
    1898             :       // Fill up the missing vector elements
    1899         595 :       NumElements = Arg.VT.getVectorNumElements() - NumElements;
    1900         595 :       Regs.append(NumElements, DAG.getUNDEF(VT));
    1901             : 
    1902        1190 :       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
    1903             :       continue;
    1904             :     }
    1905             : 
    1906        4566 :     InVals.push_back(Val);
    1907             :   }
    1908             : 
    1909       16625 :   if (!IsEntryFunc) {
    1910             :     // Special inputs come after user arguments.
    1911        1167 :     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
    1912             :   }
    1913             : 
    1914             :   // Start adding system SGPRs.
    1915       16625 :   if (IsEntryFunc) {
    1916       15458 :     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
    1917             :   } else {
    1918        1167 :     CCInfo.AllocateReg(Info->getScratchRSrcReg());
    1919        1167 :     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    1920        1167 :     CCInfo.AllocateReg(Info->getFrameOffsetReg());
    1921        1167 :     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
    1922             :   }
    1923             : 
    1924             :   auto &ArgUsageInfo =
    1925       16625 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    1926       16625 :   ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
    1927             : 
    1928             :   unsigned StackArgSize = CCInfo.getNextStackOffset();
    1929             :   Info->setBytesInStackArgArea(StackArgSize);
    1930             : 
    1931             :   return Chains.empty() ? Chain :
    1932       30383 :     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    1933             : }
    1934             : 
    1935             : // TODO: If return values can't fit in registers, we should return as many as
    1936             : // possible in registers before passing on stack.
    1937       17105 : bool SITargetLowering::CanLowerReturn(
    1938             :   CallingConv::ID CallConv,
    1939             :   MachineFunction &MF, bool IsVarArg,
    1940             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1941             :   LLVMContext &Context) const {
    1942             :   // Replacing returns with sret/stack usage doesn't make sense for shaders.
    1943             :   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
    1944             :   // for shaders. Vector types should be explicitly handled by CC.
    1945       17105 :   if (AMDGPU::isEntryFunctionCC(CallConv))
    1946             :     return true;
    1947             : 
    1948             :   SmallVector<CCValAssign, 16> RVLocs;
    1949        3288 :   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    1950        1644 :   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
    1951             : }
    1952             : 
    1953             : SDValue
    1954       16575 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    1955             :                               bool isVarArg,
    1956             :                               const SmallVectorImpl<ISD::OutputArg> &Outs,
    1957             :                               const SmallVectorImpl<SDValue> &OutVals,
    1958             :                               const SDLoc &DL, SelectionDAG &DAG) const {
    1959             :   MachineFunction &MF = DAG.getMachineFunction();
    1960       16575 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1961             : 
    1962             :   if (AMDGPU::isKernel(CallConv)) {
    1963             :     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
    1964       14645 :                                              OutVals, DL, DAG);
    1965             :   }
    1966             : 
    1967        1930 :   bool IsShader = AMDGPU::isShader(CallConv);
    1968             : 
    1969        1930 :   Info->setIfReturnsVoid(Outs.size() == 0);
    1970        1930 :   bool IsWaveEnd = Info->returnsVoid() && IsShader;
    1971             : 
    1972             :   SmallVector<ISD::OutputArg, 48> Splits;
    1973             :   SmallVector<SDValue, 48> SplitVals;
    1974             : 
    1975             :   // Split vectors into their elements.
    1976        3456 :   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    1977        1526 :     const ISD::OutputArg &Out = Outs[i];
    1978             : 
    1979        2325 :     if (IsShader && Out.VT.isVector()) {
    1980         146 :       MVT VT = Out.VT.getVectorElementType();
    1981         146 :       ISD::OutputArg NewOut = Out;
    1982             :       NewOut.Flags.setSplit();
    1983         146 :       NewOut.VT = VT;
    1984             : 
    1985             :       // We want the original number of vector elements here, e.g.
    1986             :       // three or five, not four or eight.
    1987         146 :       unsigned NumElements = Out.ArgVT.getVectorNumElements();
    1988             : 
    1989        1490 :       for (unsigned j = 0; j != NumElements; ++j) {
    1990             :         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
    1991        1344 :                                    DAG.getConstant(j, DL, MVT::i32));
    1992         672 :         SplitVals.push_back(Elem);
    1993         672 :         Splits.push_back(NewOut);
    1994         672 :         NewOut.PartOffset += NewOut.VT.getStoreSize();
    1995             :       }
    1996             :     } else {
    1997        1380 :       SplitVals.push_back(OutVals[i]);
    1998        1380 :       Splits.push_back(Out);
    1999             :     }
    2000             :   }
    2001             : 
    2002             :   // CCValAssign - represents the assignment of the return value to a location.
    2003             :   SmallVector<CCValAssign, 48> RVLocs;
    2004             : 
    2005             :   // CCState - Info about the registers and stack slots.
    2006             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
    2007        3860 :                  *DAG.getContext());
    2008             : 
    2009             :   // Analyze outgoing return values.
    2010        1930 :   CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
    2011             : 
    2012        1930 :   SDValue Flag;
    2013             :   SmallVector<SDValue, 48> RetOps;
    2014        1930 :   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
    2015             : 
    2016             :   // Add return address for callable functions.
    2017        1930 :   if (!Info->isEntryFunction()) {
    2018        1129 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2019             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2020        2258 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2021             : 
    2022             :     // FIXME: Should be able to use a vreg here, but need a way to prevent it
    2023             :     // from being allocated to a CSR.
    2024             : 
    2025             :     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2026        1129 :                                                 MVT::i64);
    2027             : 
    2028        1129 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
    2029        1129 :     Flag = Chain.getValue(1);
    2030             : 
    2031        1129 :     RetOps.push_back(PhysReturnAddrReg);
    2032             :   }
    2033             : 
    2034             :   // Copy the result values into the output registers.
    2035        2052 :   for (unsigned i = 0, realRVLocIdx = 0;
    2036        7964 :        i != RVLocs.size();
    2037             :        ++i, ++realRVLocIdx) {
    2038             :     CCValAssign &VA = RVLocs[i];
    2039             :     assert(VA.isRegLoc() && "Can only return in registers!");
    2040             :     // TODO: Partially return in registers if return values don't fit.
    2041             : 
    2042        2052 :     SDValue Arg = SplitVals[realRVLocIdx];
    2043             : 
    2044             :     // Copied from other backends.
    2045        2052 :     switch (VA.getLocInfo()) {
    2046             :     case CCValAssign::Full:
    2047             :       break;
    2048             :     case CCValAssign::BCvt:
    2049           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2050           0 :       break;
    2051             :     case CCValAssign::SExt:
    2052           0 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2053           0 :       break;
    2054             :     case CCValAssign::ZExt:
    2055           0 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2056           0 :       break;
    2057             :     case CCValAssign::AExt:
    2058           3 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2059           3 :       break;
    2060           0 :     default:
    2061           0 :       llvm_unreachable("Unknown loc info!");
    2062             :     }
    2063             : 
    2064        2052 :     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    2065        2052 :     Flag = Chain.getValue(1);
    2066        2052 :     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    2067             :   }
    2068             : 
    2069             :   // FIXME: Does sret work properly?
    2070        1930 :   if (!Info->isEntryFunction()) {
    2071             :     const SIRegisterInfo *TRI
    2072        1129 :       = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
    2073             :     const MCPhysReg *I =
    2074        1129 :       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    2075        1129 :     if (I) {
    2076           0 :       for (; *I; ++I) {
    2077           0 :         if (AMDGPU::SReg_64RegClass.contains(*I))
    2078           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
    2079           0 :         else if (AMDGPU::SReg_32RegClass.contains(*I))
    2080           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
    2081             :         else
    2082           0 :           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    2083             :       }
    2084             :     }
    2085             :   }
    2086             : 
    2087             :   // Update chain and glue.
    2088        1930 :   RetOps[0] = Chain;
    2089        1930 :   if (Flag.getNode())
    2090        1538 :     RetOps.push_back(Flag);
    2091             : 
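                     :   // Select the return terminator: a void return from a shader ends the
                     :   // wave with ENDPGM, a shader that passes outputs on uses
                     :   // RETURN_TO_EPILOG, and callable functions use the ordinary RET_FLAG.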
    2092             :   unsigned Opc = AMDGPUISD::ENDPGM;
    2093        1930 :   if (!IsWaveEnd)
    2094        1538 :     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
    2095        1930 :   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    2096             : }
    2097             : 
    2098         437 : SDValue SITargetLowering::LowerCallResult(
    2099             :     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    2100             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    2101             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    2102             :     SDValue ThisVal) const {
    2103         437 :   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
    2104             : 
    2105             :   // Assign locations to each value returned by this call.
    2106             :   SmallVector<CCValAssign, 16> RVLocs;
    2107             :   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
    2108         874 :                  *DAG.getContext());
    2109         437 :   CCInfo.AnalyzeCallResult(Ins, RetCC);
    2110             : 
    2111             :   // Copy all of the result registers out of their specified physreg.
    2112        1216 :   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    2113         114 :     CCValAssign VA = RVLocs[i];
    2114         114 :     SDValue Val;
    2115             : 
    2116         114 :     if (VA.isRegLoc()) {
    2117         114 :       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    2118             :       Chain = Val.getValue(1);
    2119         114 :       InFlag = Val.getValue(2);
    2120             :     } else if (VA.isMemLoc()) {
    2121           0 :       report_fatal_error("TODO: return values in memory");
    2122             :     } else
    2123             :       llvm_unreachable("unknown argument location type");
    2124             : 
    2125         114 :     switch (VA.getLocInfo()) {
    2126             :     case CCValAssign::Full:
    2127             :       break;
    2128             :     case CCValAssign::BCvt:
    2129           0 :       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
    2130           0 :       break;
    2131             :     case CCValAssign::ZExt:
    2132           7 :       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
    2133          14 :                         DAG.getValueType(VA.getValVT()));
    2134           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2135           7 :       break;
    2136             :     case CCValAssign::SExt:
    2137           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
    2138          14 :                         DAG.getValueType(VA.getValVT()));
    2139           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2140           7 :       break;
    2141             :     case CCValAssign::AExt:
    2142           3 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2143           3 :       break;
    2144           0 :     default:
    2145           0 :       llvm_unreachable("Unknown loc info!");
    2146             :     }
    2147             : 
    2148         114 :     InVals.push_back(Val);
    2149             :   }
    2150             : 
    2151         874 :   return Chain;
    2152             : }
    2153             : 
    2154             : // Add code to pass the special inputs required by the features in use,
    2155             : // separate from the explicit user arguments present in the IR.
    2156         471 : void SITargetLowering::passSpecialInputs(
    2157             :     CallLoweringInfo &CLI,
    2158             :     const SIMachineFunctionInfo &Info,
    2159             :     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    2160             :     SmallVectorImpl<SDValue> &MemOpChains,
    2161             :     SDValue Chain,
    2162             :     SDValue StackPtr) const {
    2163             :   // If we don't have a call site, this was a call inserted by
    2164             :   // legalization. These can never use special inputs.
    2165         471 :   if (!CLI.CS)
    2166           0 :     return;
    2167             : 
    2168             :   const Function *CalleeFunc = CLI.CS.getCalledFunction();
    2169             :   assert(CalleeFunc);
    2170             : 
    2171         471 :   SelectionDAG &DAG = CLI.DAG;
    2172         471 :   const SDLoc &DL = CLI.DL;
    2173             : 
    2174         471 :   const SISubtarget *ST = getSubtarget();
    2175             :   const SIRegisterInfo *TRI = ST->getRegisterInfo();
    2176             : 
    2177             :   auto &ArgUsageInfo =
    2178         471 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    2179             :   const AMDGPUFunctionArgInfo &CalleeArgInfo
    2180             :     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    2181             : 
    2182             :   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
    2183             : 
    2184             :   // TODO: Unify with private memory register handling. This is complicated by
    2185             :   // the fact that at least in kernels, the input argument is not necessarily
    2186             :   // in the same location as the input.
    2187         471 :   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    2188             :     AMDGPUFunctionArgInfo::DISPATCH_PTR,
    2189             :     AMDGPUFunctionArgInfo::QUEUE_PTR,
    2190             :     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    2191             :     AMDGPUFunctionArgInfo::DISPATCH_ID,
    2192             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    2193             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    2194             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    2195             :     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
    2196             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
    2197             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
    2198             :     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
    2199             :   };
    2200             : 
    2201       10833 :   for (auto InputID : InputRegs) {
    2202             :     const ArgDescriptor *OutgoingArg;
    2203             :     const TargetRegisterClass *ArgRC;
    2204             : 
    2205       10362 :     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    2206        5181 :     if (!OutgoingArg)
    2207        5061 :       continue;
    2208             : 
    2209             :     const ArgDescriptor *IncomingArg;
    2210             :     const TargetRegisterClass *IncomingArgRC;
    2211             :     std::tie(IncomingArg, IncomingArgRC)
    2212         240 :       = CallerArgInfo.getPreloadedValue(InputID);
    2213             :     assert(IncomingArgRC == ArgRC);
    2214             : 
    2215             :     // All special arguments are ints for now.
    2216         120 :     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    2217         120 :     SDValue InputReg;
    2218             : 
    2219         120 :     if (IncomingArg) {
    2220         111 :       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    2221             :     } else {
    2222             :       // The implicit arg ptr is special because it doesn't have a corresponding
    2223             :       // input for kernels, and is computed from the kernarg segment pointer.
    2224             :       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    2225           9 :       InputReg = getImplicitArgPtr(DAG, DL);
    2226             :     }
    2227             : 
    2228         240 :     if (OutgoingArg->isRegister()) {
    2229         110 :       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    2230             :     } else {
    2231             :       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
    2232             :                                               InputReg,
    2233          10 :                                               OutgoingArg->getStackOffset());
    2234          10 :       MemOpChains.push_back(ArgStore);
    2235             :     }
    2236             :   }
    2237             : }
    2238             : 
    2239             : static bool canGuaranteeTCO(CallingConv::ID CC) {
    2240          39 :   return CC == CallingConv::Fast;
    2241             : }
    2242             : 
    2243             : /// Return true if we might ever do TCO for calls with this calling convention.
    2244             : static bool mayTailCallThisCC(CallingConv::ID CC) {
    2245          43 :   switch (CC) {
    2246             :   case CallingConv::C:
    2247             :     return true;
    2248             :   default:
    2249             :     return canGuaranteeTCO(CC);
    2250             :   }
    2251             : }
    2252             : 
    2253          43 : bool SITargetLowering::isEligibleForTailCallOptimization(
    2254             :     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    2255             :     const SmallVectorImpl<ISD::OutputArg> &Outs,
    2256             :     const SmallVectorImpl<SDValue> &OutVals,
    2257             :     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
    2258          39 :   if (!mayTailCallThisCC(CalleeCC))
    2259             :     return false;
    2260             : 
    2261             :   MachineFunction &MF = DAG.getMachineFunction();
    2262             :   const Function &CallerF = MF.getFunction();
    2263             :   CallingConv::ID CallerCC = CallerF.getCallingConv();
    2264          43 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2265          43 :   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
    2266             : 
    2267             :   // Kernels aren't callable, and don't have a live-in return address, so it
    2268             :   // doesn't make sense to do a tail call with entry functions.
    2269          43 :   if (!CallerPreserved)
    2270             :     return false;
    2271             : 
    2272             :   bool CCMatch = CallerCC == CalleeCC;
    2273             : 
    2274          40 :   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    2275           0 :     if (canGuaranteeTCO(CalleeCC) && CCMatch)
    2276             :       return true;
    2277             :     return false;
    2278             :   }
    2279             : 
    2280             :   // TODO: Can we handle var args?
    2281          40 :   if (IsVarArg)
    2282             :     return false;
    2283             : 
    2284         136 :   for (const Argument &Arg : CallerF.args()) {
    2285          99 :     if (Arg.hasByValAttr())
    2286             :       return false;
    2287             :   }
    2288             : 
    2289             :   LLVMContext &Ctx = *DAG.getContext();
    2290             : 
    2291             :   // Check that the call results are passed in the same way.
    2292          37 :   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
    2293             :                                   CCAssignFnForCall(CalleeCC, IsVarArg),
    2294             :                                   CCAssignFnForCall(CallerCC, IsVarArg)))
    2295             :     return false;
    2296             : 
    2297             :   // The callee has to preserve all registers the caller needs to preserve.
    2298          37 :   if (!CCMatch) {
    2299           0 :     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    2300           0 :     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    2301             :       return false;
    2302             :   }
    2303             : 
    2304             :   // Nothing more to check if the callee is taking no arguments.
    2305          37 :   if (Outs.empty())
    2306             :     return true;
    2307             : 
    2308             :   SmallVector<CCValAssign, 16> ArgLocs;
    2309          66 :   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
    2310             : 
    2311          33 :   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
    2312             : 
    2313          33 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    2314             :   // If the stack arguments for this call do not fit into our own save area then
    2315             :   // the call cannot be made tail.
    2316             :   // TODO: Is this really necessary?
    2317          33 :   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    2318             :     return false;
    2319             : 
    2320             :   const MachineRegisterInfo &MRI = MF.getRegInfo();
    2321          30 :   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
    2322             : }
    2323             : 
    2324          14 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
    2325          14 :   if (!CI->isTailCall())
    2326             :     return false;
    2327             : 
    2328             :   const Function *ParentFn = CI->getParent()->getParent();
    2329           4 :   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    2330             :     return false;
    2331             : 
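                     :   // Honor the calling function's "disable-tail-calls" string attribute,
                     :   // e.g. (illustrative IR): attributes #0 = { "disable-tail-calls"="true" }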
    2332           1 :   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
    2333           1 :   return (Attr.getValueAsString() != "true");
    2334             : }
    2335             : 
    2336             : // The wave scratch offset register is used as the global base pointer.
    2337         477 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
    2338             :                                     SmallVectorImpl<SDValue> &InVals) const {
    2339         477 :   SelectionDAG &DAG = CLI.DAG;
    2340         477 :   const SDLoc &DL = CLI.DL;
    2341             :   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    2342             :   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
    2343             :   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
    2344         477 :   SDValue Chain = CLI.Chain;
    2345         477 :   SDValue Callee = CLI.Callee;
    2346             :   bool &IsTailCall = CLI.IsTailCall;
    2347         477 :   CallingConv::ID CallConv = CLI.CallConv;
    2348         477 :   bool IsVarArg = CLI.IsVarArg;
    2349             :   bool IsSibCall = false;
    2350             :   bool IsThisReturn = false;
    2351             :   MachineFunction &MF = DAG.getMachineFunction();
    2352             : 
    2353         477 :   if (IsVarArg) {
    2354             :     return lowerUnhandledCall(CLI, InVals,
    2355           2 :                               "unsupported call to variadic function ");
    2356             :   }
    2357             : 
    2358             :   if (!CLI.CS.getCalledFunction()) {
    2359             :     return lowerUnhandledCall(CLI, InVals,
    2360           8 :                               "unsupported indirect call to function ");
    2361             :   }
    2362             : 
    2363         472 :   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    2364             :     return lowerUnhandledCall(CLI, InVals,
    2365           2 :                               "unsupported required tail call to function ");
    2366             :   }
    2367             : 
    2368             :   // The first 4 bytes are reserved for the callee's emergency stack slot.
    2369             :   const unsigned CalleeUsableStackOffset = 4;
    2370             : 
    2371         471 :   if (IsTailCall) {
    2372          43 :     IsTailCall = isEligibleForTailCallOptimization(
    2373             :       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    2374          52 :     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
    2375           0 :       report_fatal_error("failed to perform tail call elimination on a call "
    2376             :                          "site marked musttail");
    2377             :     }
    2378             : 
    2379          43 :     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
    2380             : 
    2381             :     // A sibling call is one where we're under the usual C ABI and not planning
    2382             :     // to change that, but can still do a tail call:
    2383          86 :     if (!TailCallOpt && IsTailCall)
    2384             :       IsSibCall = true;
    2385             : 
    2386             :     if (IsTailCall)
    2387             :       ++NumTailCalls;
    2388             :   }
    2389             : 
    2390             :   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    2391             :     // FIXME: Remove this hack for function pointer types after removing
    2392             :     // support for the old address space mapping. In the new address space
    2393             :     // mapping, the pointer in the default address space is 64 bits, so it
    2394             :     // does not need this hack.
    2395             :     if (Callee.getValueType() == MVT::i32) {
    2396             :       const GlobalValue *GV = GA->getGlobal();
    2397           0 :       Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
    2398           0 :                                     GA->getTargetFlags());
    2399             :     }
    2400             :   }
    2401             :   assert(Callee.getValueType() == MVT::i64);
    2402             : 
    2403         471 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    2404             : 
    2405             :   // Analyze operands of the call, assigning locations to each operand.
    2406             :   SmallVector<CCValAssign, 16> ArgLocs;
    2407         942 :   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    2408         471 :   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
    2409         471 :   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
    2410             : 
    2411             :   // Get a count of how many bytes are to be pushed on the stack.
    2412             :   unsigned NumBytes = CCInfo.getNextStackOffset();
    2413             : 
    2414         471 :   if (IsSibCall) {
    2415             :     // Since we're not changing the ABI to make this a tail call, the memory
    2416             :     // operands are already available in the caller's incoming argument space.
    2417             :     NumBytes = 0;
    2418             :   }
    2419             : 
    2420             :   // FPDiff is the byte offset of the call's argument area from the callee's.
    2421             :   // Stores to callee stack arguments will be placed in FixedStackSlots offset
    2422             :   // by this amount for a tail call. In a sibling call it must be 0 because the
    2423             :   // caller will deallocate the entire stack and the callee still expects its
    2424             :   // arguments to begin at SP+0. Completely unused for non-tail calls.
    2425             :   int32_t FPDiff = 0;
    2426             :   MachineFrameInfo &MFI = MF.getFrameInfo();
    2427             :   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    2428             : 
    2429         471 :   SDValue CallerSavedFP;
    2430             : 
    2431             :   // Adjust the stack pointer for the new arguments...
    2432             :   // These operations are automatically eliminated by the prolog/epilog pass
    2433         471 :   if (!IsSibCall) {
    2434         437 :     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    2435             : 
    2436             :     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
    2437             : 
    2438             :     // In the HSA case, this should be an identity copy.
    2439             :     SDValue ScratchRSrcReg
    2440         437 :       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    2441         437 :     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    2442             : 
    2443             :     // TODO: Don't hardcode these registers; get them from the callee function.
    2444             :     SDValue ScratchWaveOffsetReg
    2445         437 :       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
    2446         437 :     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
    2447             : 
    2448         437 :     if (!Info->isEntryFunction()) {
    2449             :       // Avoid clobbering this function's FP value. In the current convention
    2450             :       // the callee will overwrite it, so save/restore it around the call site.
    2451          96 :       CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
    2452          96 :                                          Info->getFrameOffsetReg(), MVT::i32);
    2453             :     }
    2454             :   }
    2455             : 
    2456             :   // Stack pointer relative accesses are done by changing the offset SGPR. This
    2457             :   // is just the VGPR offset component.
    2458         471 :   SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
    2459             : 
    2460             :   SmallVector<SDValue, 8> MemOpChains;
    2461             :   MVT PtrVT = MVT::i32;
    2462             : 
    2463             :   // Walk the register/memloc assignments, inserting copies/loads.
    2464        1526 :   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
    2465             :        ++i, ++realArgIdx) {
    2466        1055 :     CCValAssign &VA = ArgLocs[i];
    2467        1055 :     SDValue Arg = OutVals[realArgIdx];
    2468             : 
    2469             :     // Promote the value if needed.
    2470        1055 :     switch (VA.getLocInfo()) {
    2471             :     case CCValAssign::Full:
    2472             :       break;
    2473             :     case CCValAssign::BCvt:
    2474           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2475           0 :       break;
    2476             :     case CCValAssign::ZExt:
    2477          10 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2478          10 :       break;
    2479             :     case CCValAssign::SExt:
    2480          10 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2481          10 :       break;
    2482             :     case CCValAssign::AExt:
    2483           4 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2484           4 :       break;
    2485             :     case CCValAssign::FPExt:
    2486           0 :       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
    2487           0 :       break;
    2488           0 :     default:
    2489           0 :       llvm_unreachable("Unknown loc info!");
    2490             :     }
    2491             : 
    2492        1055 :     if (VA.isRegLoc()) {
    2493         993 :       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    2494             :     } else {
    2495             :       assert(VA.isMemLoc());
    2496             : 
    2497          62 :       SDValue DstAddr;
    2498             :       MachinePointerInfo DstInfo;
    2499             : 
    2500             :       unsigned LocMemOffset = VA.getLocMemOffset();
    2501          62 :       int32_t Offset = LocMemOffset;
    2502             : 
    2503          62 :       SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
    2504             : 
    2505          62 :       if (IsTailCall) {
    2506          27 :         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    2507          27 :         unsigned OpSize = Flags.isByVal() ?
    2508          51 :           Flags.getByValSize() : VA.getValVT().getStoreSize();
    2509             : 
    2510             :         Offset = Offset + FPDiff;
    2511          27 :         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
    2512             : 
    2513          27 :         DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
    2514          54 :                                          StackPtr);
    2515          27 :         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
    2516             : 
    2517             :         // Make sure any stack arguments overlapping with where we're storing
    2518             :         // are loaded before this eventual operation. Otherwise they'll be
    2519             :         // clobbered.
    2520             : 
    2521             :         // FIXME: Why is this really necessary? This seems to just result in a
    2522             :         // lot of code to copy the stack and write them back to the same
    2523             :         // locations, which are supposed to be immutable?
    2524          27 :         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
    2525             :       } else {
    2526          35 :         DstAddr = PtrOff;
    2527          35 :         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
    2528             :       }
    2529             : 
    2530          62 :       if (Outs[i].Flags.isByVal()) {
    2531             :         SDValue SizeNode =
    2532          28 :             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
    2533             :         SDValue Cpy = DAG.getMemcpy(
    2534             :             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
    2535             :             /*isVol = */ false, /*AlwaysInline = */ true,
    2536             :             /*isTailCall = */ false, DstInfo,
    2537          28 :             MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
    2538          56 :                 *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
    2539             : 
    2540          28 :         MemOpChains.push_back(Cpy);
    2541             :       } else {
    2542          34 :         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
    2543          34 :         MemOpChains.push_back(Store);
    2544             :       }
    2545             :     }
    2546             :   }
    2547             : 
    2548             :   // Copy special input registers after user input arguments.
    2549         471 :   passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
    2550             : 
    2551         471 :   if (!MemOpChains.empty())
    2552          46 :     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
    2553             : 
    2554             :   // Build a sequence of copy-to-reg nodes chained together with token chain
    2555             :   // and flag operands which copy the outgoing args into the appropriate regs.
    2556         471 :   SDValue InFlag;
    2557        4425 :   for (auto &RegToPass : RegsToPass) {
    2558        1977 :     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
    2559        1977 :                              RegToPass.second, InFlag);
    2560        1977 :     InFlag = Chain.getValue(1);
    2561             :   }
    2562             : 
    2563             : 
    2564         471 :   SDValue PhysReturnAddrReg;
    2565         471 :   if (IsTailCall) {
    2566             :     // Since the return is being combined with the call, we need to pass on the
    2567             :     // return address.
    2568             : 
    2569          34 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2570             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2571          68 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2572             : 
    2573          34 :     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2574          34 :                                         MVT::i64);
    2575          34 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    2576          34 :     InFlag = Chain.getValue(1);
    2577             :   }
    2578             : 
    2579             :   // We don't usually want to end the call-sequence here because we would tidy
    2580             :   // the frame up *after* the call; however, in the ABI-changing tail-call case
    2581             :   // we've carefully laid out the parameters so that when sp is reset they'll be
    2582             :   // in the correct location.
    2583         471 :   if (IsTailCall && !IsSibCall) {
    2584           0 :     Chain = DAG.getCALLSEQ_END(Chain,
    2585             :                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
    2586             :                                DAG.getTargetConstant(0, DL, MVT::i32),
    2587           0 :                                InFlag, DL);
    2588           0 :     InFlag = Chain.getValue(1);
    2589             :   }
    2590             : 
    2591             :   std::vector<SDValue> Ops;
    2592         471 :   Ops.push_back(Chain);
    2593         471 :   Ops.push_back(Callee);
    2594             : 
    2595         471 :   if (IsTailCall) {
    2596             :     // Each tail call may have to adjust the stack by a different amount, so
    2597             :     // this information must travel along with the operation for eventual
    2598             :     // consumption by emitEpilogue.
    2599          68 :     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
    2600             : 
    2601          34 :     Ops.push_back(PhysReturnAddrReg);
    2602             :   }
    2603             : 
    2604             :   // Add argument registers to the end of the list so that they are known live
    2605             :   // into the call.
    2606        4425 :   for (auto &RegToPass : RegsToPass) {
    2607        3954 :     Ops.push_back(DAG.getRegister(RegToPass.first,
    2608        1977 :                                   RegToPass.second.getValueType()));
    2609             :   }
    2610             : 
    2611             :   // Add a register mask operand representing the call-preserved registers.
    2612             : 
    2613         471 :   const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
    2614         471 :   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
    2615             :   assert(Mask && "Missing call preserved mask for calling convention");
    2616         942 :   Ops.push_back(DAG.getRegisterMask(Mask));
    2617             : 
    2618         471 :   if (InFlag.getNode())
    2619         471 :     Ops.push_back(InFlag);
    2620             : 
    2621         471 :   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    2622             : 
    2623             :   // If we're doing a tail call, use a TC_RETURN here rather than an
    2624             :   // actual call instruction.
    2625         471 :   if (IsTailCall) {
    2626             :     MFI.setHasTailCall();
    2627          34 :     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
    2628             :   }
    2629             : 
    2630             :   // Returns a chain and a flag for retval copy to use.
    2631         437 :   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
    2632         437 :   Chain = Call.getValue(0);
    2633         437 :   InFlag = Call.getValue(1);
    2634             : 
    2635         437 :   if (CallerSavedFP) {
    2636          96 :     SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
    2637          96 :     Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
    2638          96 :     InFlag = Chain.getValue(1);
    2639             :   }
    2640             : 
    2641         437 :   uint64_t CalleePopBytes = NumBytes;
    2642         437 :   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
    2643             :                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
    2644             :                              InFlag, DL);
    2645         437 :   if (!Ins.empty())
    2646         104 :     InFlag = Chain.getValue(1);
    2647             : 
    2648             :   // Handle result values, copying them out of physregs into vregs that we
    2649             :   // return.
    2650             :   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
    2651             :                          InVals, IsThisReturn,
    2652         437 :                          IsThisReturn ? OutVals[0] : SDValue());
    2653             : }
    2654             : 
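                     : // Maps a register name from the named-register intrinsics onto a physical
                     : // register, e.g. (illustrative IR):
                     : //   %v = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"exec"}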
    2655          27 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
    2656             :                                              SelectionDAG &DAG) const {
    2657             :   unsigned Reg = StringSwitch<unsigned>(RegName)
    2658          54 :     .Case("m0", AMDGPU::M0)
    2659          54 :     .Case("exec", AMDGPU::EXEC)
    2660          54 :     .Case("exec_lo", AMDGPU::EXEC_LO)
    2661          54 :     .Case("exec_hi", AMDGPU::EXEC_HI)
    2662          54 :     .Case("flat_scratch", AMDGPU::FLAT_SCR)
    2663          54 :     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    2664          54 :     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    2665             :     .Default(AMDGPU::NoRegister);
    2666             : 
    2667          27 :   if (Reg == AMDGPU::NoRegister) {
    2668           0 :     report_fatal_error(Twine("invalid register name \""
    2669             :                              + StringRef(RegName)  + "\"."));
    2670             : 
    2671             :   }
    2672             : 
    2673          30 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    2674           3 :       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    2675           1 :     report_fatal_error(Twine("invalid register \""
    2676             :                              + StringRef(RegName)  + "\" for subtarget."));
    2677             :   }
    2678             : 
    2679          26 :   switch (Reg) {
    2680          17 :   case AMDGPU::M0:
    2681             :   case AMDGPU::EXEC_LO:
    2682             :   case AMDGPU::EXEC_HI:
    2683             :   case AMDGPU::FLAT_SCR_LO:
    2684             :   case AMDGPU::FLAT_SCR_HI:
    2685          17 :     if (VT.getSizeInBits() == 32)
    2686             :       return Reg;
    2687             :     break;
    2688           9 :   case AMDGPU::EXEC:
    2689             :   case AMDGPU::FLAT_SCR:
    2690           9 :     if (VT.getSizeInBits() == 64)
    2691             :       return Reg;
    2692             :     break;
    2693           0 :   default:
    2694           0 :     llvm_unreachable("missing register type checking");
    2695             :   }
    2696             : 
    2697           2 :   report_fatal_error(Twine("invalid type for register \""
    2698             :                            + StringRef(RegName) + "\"."));
    2699             : }
    2700             : 
    2701             : // If kill is not the last instruction, split the block so kill is always a
    2702             : // proper terminator.
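                     : // Roughly (sketch): the trailing instructions move into a new successor
                     : // block and the kill pseudo is rewritten to its terminator form:
                     : //   bb:                           bb:
                     : //     ...                           ...
                     : //     <kill pseudo>         =>      <kill terminator>
                     : //     <rest of block>             bb.split:
                     : //                                   <rest of block>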
    2703          86 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
    2704             :                                                     MachineBasicBlock *BB) const {
    2705          86 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    2706             : 
    2707             :   MachineBasicBlock::iterator SplitPoint(&MI);
    2708             :   ++SplitPoint;
    2709             : 
    2710          86 :   if (SplitPoint == BB->end()) {
    2711             :     // Don't bother with a new block.
    2712           4 :     MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2713           4 :     return BB;
    2714             :   }
    2715             : 
    2716             :   MachineFunction *MF = BB->getParent();
    2717             :   MachineBasicBlock *SplitBB
    2718          82 :     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
    2719             : 
    2720             :   MF->insert(++MachineFunction::iterator(BB), SplitBB);
    2721             :   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
    2722             : 
    2723          82 :   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
    2724          82 :   BB->addSuccessor(SplitBB);
    2725             : 
    2726          82 :   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2727          82 :   return SplitBB;
    2728             : }
    2729             : 
    2730             : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
    2731             : // wavefront. If the value is uniform and just happens to be in a VGPR, this
    2732             : // will only do one iteration. In the worst case, this will loop 64 times.
    2733             : //
    2734             : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
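                     : //
                     : // The loop built below looks roughly like this (sketch; operands simplified,
                     : // and in GPR-index mode S_SET_GPR_IDX_ON is emitted instead of writing m0):
                     : //   loop:
                     : //     %cur  = V_READFIRSTLANE_B32 %idx_vgpr
                     : //     %cond = V_CMP_EQ_U32_e64 %cur, %idx_vgpr
                     : //     %save = S_AND_SAVEEXEC_B64 %cond
                     : //     S_MOV_B32 m0, %cur            ; or S_ADD_I32 m0, %cur, <offset>
                     : //     <indirect access using m0 goes here>
                     : //     S_XOR_B64 exec, exec, %save
                     : //     S_CBRANCH_EXECNZ loop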
    2735          32 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
    2736             :   const SIInstrInfo *TII,
    2737             :   MachineRegisterInfo &MRI,
    2738             :   MachineBasicBlock &OrigBB,
    2739             :   MachineBasicBlock &LoopBB,
    2740             :   const DebugLoc &DL,
    2741             :   const MachineOperand &IdxReg,
    2742             :   unsigned InitReg,
    2743             :   unsigned ResultReg,
    2744             :   unsigned PhiReg,
    2745             :   unsigned InitSaveExecReg,
    2746             :   int Offset,
    2747             :   bool UseGPRIdxMode,
    2748             :   bool IsIndirectSrc) {
    2749          32 :   MachineBasicBlock::iterator I = LoopBB.begin();
    2750             : 
    2751          32 :   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2752          32 :   unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2753          32 :   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2754          32 :   unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2755             : 
    2756          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    2757          32 :     .addReg(InitReg)
    2758             :     .addMBB(&OrigBB)
    2759          32 :     .addReg(ResultReg)
    2760             :     .addMBB(&LoopBB);
    2761             : 
    2762          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    2763          32 :     .addReg(InitSaveExecReg)
    2764             :     .addMBB(&OrigBB)
    2765          32 :     .addReg(NewExec)
    2766             :     .addMBB(&LoopBB);
    2767             : 
    2768             :   // Read the next variant <- also loop target.
    2769          64 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    2770          32 :     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    2771             : 
    2772             :   // Compare the just read M0 value to all possible Idx values.
    2773          64 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    2774          32 :     .addReg(CurrentIdxReg)
    2775          32 :     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
    2776             : 
    2777             :   // Update EXEC, saving the original EXEC value to NewExec.
    2778          64 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    2779          32 :     .addReg(CondReg, RegState::Kill);
    2780             : 
    2781          32 :   MRI.setSimpleHint(NewExec, CondReg);
    2782             : 
    2783          32 :   if (UseGPRIdxMode) {
    2784             :     unsigned IdxReg;
    2785          16 :     if (Offset == 0) {
    2786             :       IdxReg = CurrentIdxReg;
    2787             :     } else {
    2788           6 :       IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2789          12 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
    2790           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2791           6 :         .addImm(Offset);
    2792             :     }
    2793          16 :     unsigned IdxMode = IsIndirectSrc ?
    2794             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2795             :     MachineInstr *SetOn =
    2796          32 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2797          16 :       .addReg(IdxReg, RegState::Kill)
    2798          16 :       .addImm(IdxMode);
    2799             :     SetOn->getOperand(3).setIsUndef();
    2800             :   } else {
    2801             :     // Move index from VCC into M0
    2802          16 :     if (Offset == 0) {
    2803          20 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2804          10 :         .addReg(CurrentIdxReg, RegState::Kill);
    2805             :     } else {
    2806          12 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2807           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2808           6 :         .addImm(Offset);
    2809             :     }
    2810             :   }
    2811             : 
    2812             :   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
    2813             :   MachineInstr *InsertPt =
    2814          64 :     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    2815          32 :     .addReg(AMDGPU::EXEC)
    2816          32 :     .addReg(NewExec);
    2817             : 
    2818             :   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
    2819             :   // s_cbranch_scc0?
    2820             : 
    2821             :   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
    2822          64 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    2823             :     .addMBB(&LoopBB);
    2824             : 
    2825          32 :   return InsertPt->getIterator();
    2826             : }
    2827             : 
    2828             : // This has slightly sub-optimal register allocation when the source vector is
    2829             : // killed by the read. The register allocator does not understand that the kill
    2830             : // is per-workitem, so the value is kept alive for the whole loop and we end up
    2831             : // not re-using a subregister from it, using one more VGPR than necessary. That
    2832             : // extra VGPR was avoided when this was expanded after register allocation.
    2833          32 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
    2834             :                                                   MachineBasicBlock &MBB,
    2835             :                                                   MachineInstr &MI,
    2836             :                                                   unsigned InitResultReg,
    2837             :                                                   unsigned PhiReg,
    2838             :                                                   int Offset,
    2839             :                                                   bool UseGPRIdxMode,
    2840             :                                                   bool IsIndirectSrc) {
    2841             :   MachineFunction *MF = MBB.getParent();
    2842             :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2843             :   const DebugLoc &DL = MI.getDebugLoc();
    2844             :   MachineBasicBlock::iterator I(&MI);
    2845             : 
    2846             :   unsigned DstReg = MI.getOperand(0).getReg();
    2847          32 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2848          32 :   unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2849             : 
    2850          32 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
    2851             : 
    2852             :   // Save the EXEC mask
    2853          64 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    2854          32 :     .addReg(AMDGPU::EXEC);
    2855             : 
    2856             :   // To insert the loop we need to split the block. Move everything after this
    2857             :   // point to a new block, and insert a new empty block between the two.
    2858          32 :   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    2859          32 :   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    2860             :   MachineFunction::iterator MBBI(MBB);
    2861             :   ++MBBI;
    2862             : 
    2863             :   MF->insert(MBBI, LoopBB);
    2864             :   MF->insert(MBBI, RemainderBB);
    2865             : 
    2866          32 :   LoopBB->addSuccessor(LoopBB);
    2867          32 :   LoopBB->addSuccessor(RemainderBB);
    2868             : 
    2869             :   // Move the rest of the block into a new block.
    2870          32 :   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    2871             :   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    2872             : 
    2873          32 :   MBB.addSuccessor(LoopBB);
    2874             : 
    2875          32 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2876             : 
    2877             :   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
    2878             :                                       InitResultReg, DstReg, PhiReg, TmpExec,
    2879          32 :                                       Offset, UseGPRIdxMode, IsIndirectSrc);
    2880             : 
    2881          32 :   MachineBasicBlock::iterator First = RemainderBB->begin();
    2882          64 :   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    2883          32 :     .addReg(SaveExec);
    2884             : 
    2885          32 :   return InsPt;
    2886             : }
    2887             : 
    2888             : // Returns the subregister index and the remaining offset.
    2889             : static std::pair<unsigned, int>
    2890         157 : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
    2891             :                             const TargetRegisterClass *SuperRC,
    2892             :                             unsigned VecReg,
    2893             :                             int Offset) {
    2894         157 :   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    2895             : 
    2896             :   // Skip out of bounds offsets, or else we would end up using an undefined
    2897             :   // register.
    2898         157 :   if (Offset >= NumElts || Offset < 0)
    2899          40 :     return std::make_pair(AMDGPU::sub0, Offset);
    2900             : 
    2901         234 :   return std::make_pair(AMDGPU::sub0 + Offset, 0);
    2902             : }
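                     : // For example, with a 128-bit register class (4 x 32-bit elements): a
                     : // constant Offset of 2 yields (sub2, 0), so the element is addressed
                     : // statically; an out-of-range Offset such as 5 yields (sub0, 5), leaving the
                     : // whole offset to be applied at run time through M0 / GPR-index mode.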
    2903             : 
    2904             : // Return true if the index is an SGPR and was set.
    2905         157 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
    2906             :                                  MachineRegisterInfo &MRI,
    2907             :                                  MachineInstr &MI,
    2908             :                                  int Offset,
    2909             :                                  bool UseGPRIdxMode,
    2910             :                                  bool IsIndirectSrc) {
    2911             :   MachineBasicBlock *MBB = MI.getParent();
    2912             :   const DebugLoc &DL = MI.getDebugLoc();
    2913             :   MachineBasicBlock::iterator I(&MI);
    2914             : 
    2915         157 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2916             :   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
    2917             : 
    2918             :   assert(Idx->getReg() != AMDGPU::NoRegister);
    2919             : 
    2920         157 :   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    2921             :     return false;
    2922             : 
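                     :   // With an SGPR index there are two ways to program the indexing hardware:
                     :   // on subtargets with VGPR indexing mode, S_SET_GPR_IDX_ON takes the index
                     :   // directly; otherwise the index (plus any constant offset) is written to M0
                     :   // for the V_MOVREL* instructions.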
    2923         125 :   if (UseGPRIdxMode) {
    2924          30 :     unsigned IdxMode = IsIndirectSrc ?
    2925             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2926          30 :     if (Offset == 0) {
    2927             :       MachineInstr *SetOn =
    2928          16 :           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2929             :               .add(*Idx)
    2930          16 :               .addImm(IdxMode);
    2931             : 
    2932             :       SetOn->getOperand(3).setIsUndef();
    2933             :     } else {
    2934          14 :       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    2935          14 :       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
    2936             :           .add(*Idx)
    2937          14 :           .addImm(Offset);
    2938             :       MachineInstr *SetOn =
    2939          28 :         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2940          14 :         .addReg(Tmp, RegState::Kill)
    2941          14 :         .addImm(IdxMode);
    2942             : 
    2943             :       SetOn->getOperand(3).setIsUndef();
    2944             :     }
    2945             : 
    2946             :     return true;
    2947             :   }
    2948             : 
    2949          95 :   if (Offset == 0) {
    2950         162 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2951             :       .add(*Idx);
    2952             :   } else {
    2953          14 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2954             :       .add(*Idx)
    2955          14 :       .addImm(Offset);
    2956             :   }
    2957             : 
    2958             :   return true;
    2959             : }
    2960             : 
    2961             : // Control flow needs to be inserted if indexing with a VGPR.
    2962          67 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
    2963             :                                           MachineBasicBlock &MBB,
    2964             :                                           const SISubtarget &ST) {
    2965             :   const SIInstrInfo *TII = ST.getInstrInfo();
    2966             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    2967             :   MachineFunction *MF = MBB.getParent();
    2968             :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2969             : 
    2970             :   unsigned Dst = MI.getOperand(0).getReg();
    2971          67 :   unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
    2972          67 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    2973             : 
    2974             :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
    2975             : 
    2976             :   unsigned SubReg;
    2977             :   std::tie(SubReg, Offset)
    2978         134 :     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
    2979             : 
    2980             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    2981             : 
    2982          67 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    2983             :     MachineBasicBlock::iterator I(&MI);
    2984             :     const DebugLoc &DL = MI.getDebugLoc();
    2985             : 
    2986          55 :     if (UseGPRIdxMode) {
    2987             :       // TODO: Look at the uses to avoid the copy. This may require rescheduling
    2988             :       // to avoid interfering with other uses, so probably requires a new
    2989             :       // optimization pass.
    2990          32 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2991          16 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2992          16 :         .addReg(SrcReg, RegState::Implicit)
    2993          16 :         .addReg(AMDGPU::M0, RegState::Implicit);
    2994          16 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2995             :     } else {
    2996          78 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2997          39 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2998          39 :         .addReg(SrcReg, RegState::Implicit);
    2999             :     }
    3000             : 
    3001          55 :     MI.eraseFromParent();
    3002             : 
    3003             :     return &MBB;
    3004             :   }
    3005             : 
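                     :   // The index lives in a VGPR: build a waterfall loop (loadM0FromVGPR) that
                     :   // handles one distinct index value per iteration, with the indexed move
                     :   // emitted inside the loop.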
    3006             :   const DebugLoc &DL = MI.getDebugLoc();
    3007             :   MachineBasicBlock::iterator I(&MI);
    3008             : 
    3009          12 :   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3010          12 :   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3011             : 
    3012          12 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
    3013             : 
    3014             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
    3015          12 :                               Offset, UseGPRIdxMode, true);
    3016             :   MachineBasicBlock *LoopBB = InsPt->getParent();
    3017             : 
    3018          12 :   if (UseGPRIdxMode) {
    3019          12 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    3020           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    3021           6 :       .addReg(SrcReg, RegState::Implicit)
    3022           6 :       .addReg(AMDGPU::M0, RegState::Implicit);
    3023           6 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3024             :   } else {
    3025          12 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    3026           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    3027           6 :       .addReg(SrcReg, RegState::Implicit);
    3028             :   }
    3029             : 
    3030          12 :   MI.eraseFromParent();
    3031             : 
    3032          12 :   return LoopBB;
    3033             : }
    3034             : 
    3035          66 : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
    3036             :                                  const TargetRegisterClass *VecRC) {
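                     :   // Pick the V_MOVRELD_B32 pseudo whose vector width (V1..V16 32-bit lanes)
                     :   // matches the size of the register class being written through the index.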
    3037          66 :   switch (TRI.getRegSizeInBits(*VecRC)) {
    3038             :   case 32: // 4 bytes
    3039             :     return AMDGPU::V_MOVRELD_B32_V1;
    3040           6 :   case 64: // 8 bytes
    3041           6 :     return AMDGPU::V_MOVRELD_B32_V2;
    3042          38 :   case 128: // 16 bytes
    3043          38 :     return AMDGPU::V_MOVRELD_B32_V4;
    3044          16 :   case 256: // 32 bytes
    3045          16 :     return AMDGPU::V_MOVRELD_B32_V8;
    3046           6 :   case 512: // 64 bytes
    3047           6 :     return AMDGPU::V_MOVRELD_B32_V16;
    3048           0 :   default:
    3049           0 :     llvm_unreachable("unsupported size for MOVRELD pseudos");
    3050             :   }
    3051             : }
    3052             : 
    3053          90 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
    3054             :                                           MachineBasicBlock &MBB,
    3055             :                                           const SISubtarget &ST) {
    3056             :   const SIInstrInfo *TII = ST.getInstrInfo();
    3057             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    3058             :   MachineFunction *MF = MBB.getParent();
    3059             :   MachineRegisterInfo &MRI = MF->getRegInfo();
    3060             : 
    3061             :   unsigned Dst = MI.getOperand(0).getReg();
    3062          90 :   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    3063          90 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    3064          90 :   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
    3065          90 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    3066             :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
    3067             : 
    3068             :   // This can be an immediate, but will be folded later.
    3069             :   assert(Val->getReg());
    3070             : 
    3071             :   unsigned SubReg;
    3072         180 :   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
    3073             :                                                          SrcVec->getReg(),
    3074             :                                                          Offset);
    3075             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    3076             : 
    3077          90 :   if (Idx->getReg() == AMDGPU::NoRegister) {
    3078             :     MachineBasicBlock::iterator I(&MI);
    3079             :     const DebugLoc &DL = MI.getDebugLoc();
    3080             : 
    3081             :     assert(Offset == 0);
    3082             : 
    3083           0 :     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
    3084             :         .add(*SrcVec)
    3085             :         .add(*Val)
    3086           0 :         .addImm(SubReg);
    3087             : 
    3088           0 :     MI.eraseFromParent();
    3089             :     return &MBB;
    3090             :   }
    3091             : 
    3092          90 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    3093             :     MachineBasicBlock::iterator I(&MI);
    3094             :     const DebugLoc &DL = MI.getDebugLoc();
    3095             : 
    3096          70 :     if (UseGPRIdxMode) {
    3097          28 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3098          14 :           .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
    3099             :           .add(*Val)
    3100          14 :           .addReg(Dst, RegState::ImplicitDefine)
    3101          14 :           .addReg(SrcVec->getReg(), RegState::Implicit)
    3102          14 :           .addReg(AMDGPU::M0, RegState::Implicit);
    3103             : 
    3104          14 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3105             :     } else {
    3106          56 :       const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3107             : 
    3108         112 :       BuildMI(MBB, I, DL, MovRelDesc)
    3109          56 :           .addReg(Dst, RegState::Define)
    3110          56 :           .addReg(SrcVec->getReg())
    3111             :           .add(*Val)
    3112          56 :           .addImm(SubReg - AMDGPU::sub0);
    3113             :     }
    3114             : 
    3115          70 :     MI.eraseFromParent();
    3116             :     return &MBB;
    3117             :   }
    3118             : 
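                     :   // VGPR index: fall back to the waterfall loop. Val is re-read on every
                     :   // iteration of the loop, so any kill flag on it would be wrong and is
                     :   // cleared below.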
    3119          20 :   if (Val->isReg())
    3120          20 :     MRI.clearKillFlags(Val->getReg());
    3121             : 
    3122             :   const DebugLoc &DL = MI.getDebugLoc();
    3123             : 
    3124          20 :   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
    3125             : 
    3126             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
    3127          20 :                               Offset, UseGPRIdxMode, false);
    3128             :   MachineBasicBlock *LoopBB = InsPt->getParent();
    3129             : 
    3130          20 :   if (UseGPRIdxMode) {
    3131          20 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3132          10 :         .addReg(PhiReg, RegState::Undef, SubReg) // vdst
    3133             :         .add(*Val)                               // src0
    3134          10 :         .addReg(Dst, RegState::ImplicitDefine)
    3135          10 :         .addReg(PhiReg, RegState::Implicit)
    3136          10 :         .addReg(AMDGPU::M0, RegState::Implicit);
    3137          10 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3138             :   } else {
    3139          10 :     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3140             : 
    3141          20 :     BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
    3142          10 :         .addReg(Dst, RegState::Define)
    3143          10 :         .addReg(PhiReg)
    3144             :         .add(*Val)
    3145          10 :         .addImm(SubReg - AMDGPU::sub0);
    3146             :   }
    3147             : 
    3148          20 :   MI.eraseFromParent();
    3149             : 
    3150          20 :   return LoopBB;
    3151             : }
    3152             : 
    3153       12118 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    3154             :   MachineInstr &MI, MachineBasicBlock *BB) const {
    3155             : 
    3156       12118 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3157             :   MachineFunction *MF = BB->getParent();
    3158       12118 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    3159             : 
    3160       12118 :   if (TII->isMIMG(MI)) {
    3161         376 :     if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
    3162           0 :       report_fatal_error("missing mem operand from MIMG instruction");
    3163             :     }
    3164             :     // Add a memoperand for mimg instructions so that they aren't assumed to
    3165             :     // be ordered memory instructions.
    3166             : 
    3167             :     return BB;
    3168             :   }
    3169             : 
    3170       11742 :   switch (MI.getOpcode()) {
    3171        2345 :   case AMDGPU::S_ADD_U64_PSEUDO:
    3172             :   case AMDGPU::S_SUB_U64_PSEUDO: {
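                     :     // Expand the 64-bit scalar add/sub into a low-half S_ADD_U32 / S_SUB_U32,
                     :     // a carry-consuming high-half S_ADDC_U32 / S_SUBB_U32, and a REG_SEQUENCE
                     :     // that reassembles the 64-bit result.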
    3173             :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3174             :     const DebugLoc &DL = MI.getDebugLoc();
    3175             : 
    3176             :     MachineOperand &Dest = MI.getOperand(0);
    3177             :     MachineOperand &Src0 = MI.getOperand(1);
    3178             :     MachineOperand &Src1 = MI.getOperand(2);
    3179             : 
    3180        2345 :     unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3181        2345 :     unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3182             : 
    3183             :     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3184             :      Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3185        2345 :      &AMDGPU::SReg_32_XM0RegClass);
    3186             :     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3187             :       Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3188        2345 :       &AMDGPU::SReg_32_XM0RegClass);
    3189             : 
    3190             :     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3191             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3192        2345 :       &AMDGPU::SReg_32_XM0RegClass);
    3193             :     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3194             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3195        2345 :       &AMDGPU::SReg_32_XM0RegClass);
    3196             : 
    3197             :     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    3198             : 
    3199        2345 :     unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    3200        2345 :     unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    3201        2345 :     BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
    3202             :       .add(Src0Sub0)
    3203             :       .add(Src1Sub0);
    3204        2345 :     BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
    3205             :       .add(Src0Sub1)
    3206             :       .add(Src1Sub1);
    3207        4690 :     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
    3208        2345 :       .addReg(DestSub0)
    3209             :       .addImm(AMDGPU::sub0)
    3210        2345 :       .addReg(DestSub1)
    3211             :       .addImm(AMDGPU::sub1);
    3212        2345 :     MI.eraseFromParent();
    3213             :     return BB;
    3214             :   }
    3215        7615 :   case AMDGPU::SI_INIT_M0: {
    3216       15230 :     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
    3217        7615 :             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    3218             :         .add(MI.getOperand(0));
    3219        7615 :     MI.eraseFromParent();
    3220        7615 :     return BB;
    3221             :   }
    3222           2 :   case AMDGPU::SI_INIT_EXEC:
    3223             :     // This should be before all vector instructions.
    3224             :     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    3225             :             AMDGPU::EXEC)
    3226             :         .addImm(MI.getOperand(0).getImm());
    3227           2 :     MI.eraseFromParent();
    3228           2 :     return BB;
    3229             : 
    3230             :   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    3231             :     // Extract the thread count from an SGPR input and set EXEC accordingly.
    3232             :     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    3233             :     //
    3234             :     // S_BFE_U32 count, input, {shift, 7}
    3235             :     // S_BFM_B64 exec, count, 0
    3236             :     // S_CMP_EQ_U32 count, 64
    3237             :     // S_CMOV_B64 exec, -1
    3238             :     MachineInstr *FirstMI = &*BB->begin();
    3239             :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3240             :     unsigned InputReg = MI.getOperand(0).getReg();
    3241           4 :     unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3242             :     bool Found = false;
    3243             : 
    3244             :     // Move the COPY of the input reg to the beginning, so that we can use it.
    3245          14 :     for (auto I = BB->begin(); I != &MI; I++) {
    3246          16 :       if (I->getOpcode() != TargetOpcode::COPY ||
    3247             :           I->getOperand(0).getReg() != InputReg)
    3248             :         continue;
    3249             : 
    3250           4 :       if (I == FirstMI) {
    3251           0 :         FirstMI = &*++BB->begin();
    3252             :       } else {
    3253           4 :         I->removeFromParent();
    3254             :         BB->insert(FirstMI, &*I);
    3255             :       }
    3256             :       Found = true;
    3257             :       break;
    3258             :     }
    3259             :     assert(Found);
    3260             :     (void)Found;
    3261             : 
    3262             :     // This should be before all vector instructions.
    3263          16 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
    3264           4 :         .addReg(InputReg)
    3265           4 :         .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    3266          16 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
    3267             :             AMDGPU::EXEC)
    3268           4 :         .addReg(CountReg)
    3269             :         .addImm(0);
    3270          16 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
    3271           4 :         .addReg(CountReg, RegState::Kill)
    3272             :         .addImm(64);
    3273           8 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
    3274             :             AMDGPU::EXEC)
    3275             :         .addImm(-1);
    3276           4 :     MI.eraseFromParent();
    3277           4 :     return BB;
    3278             :   }
    3279             : 
    3280             :   case AMDGPU::GET_GROUPSTATICSIZE: {
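                     :     // Materialize the function's static LDS allocation size as an S_MOV_B32
                     :     // immediate.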
    3281             :     DebugLoc DL = MI.getDebugLoc();
    3282          61 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
    3283             :         .add(MI.getOperand(0))
    3284          61 :         .addImm(MFI->getLDSSize());
    3285          61 :     MI.eraseFromParent();
    3286             :     return BB;
    3287             :   }
    3288          67 :   case AMDGPU::SI_INDIRECT_SRC_V1:
    3289             :   case AMDGPU::SI_INDIRECT_SRC_V2:
    3290             :   case AMDGPU::SI_INDIRECT_SRC_V4:
    3291             :   case AMDGPU::SI_INDIRECT_SRC_V8:
    3292             :   case AMDGPU::SI_INDIRECT_SRC_V16:
    3293          67 :     return emitIndirectSrc(MI, *BB, *getSubtarget());
    3294          90 :   case AMDGPU::SI_INDIRECT_DST_V1:
    3295             :   case AMDGPU::SI_INDIRECT_DST_V2:
    3296             :   case AMDGPU::SI_INDIRECT_DST_V4:
    3297             :   case AMDGPU::SI_INDIRECT_DST_V8:
    3298             :   case AMDGPU::SI_INDIRECT_DST_V16:
    3299          90 :     return emitIndirectDst(MI, *BB, *getSubtarget());
    3300          86 :   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    3301             :   case AMDGPU::SI_KILL_I1_PSEUDO:
    3302          86 :     return splitKillBlock(MI, BB);
    3303          49 :   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
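                     :     // Split the 64-bit select into two V_CNDMASK_B32_e64 operations on the
                     :     // sub0/sub1 halves, sharing a copy of the condition, and recombine the
                     :     // halves with REG_SEQUENCE.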
    3304             :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3305             : 
    3306             :     unsigned Dst = MI.getOperand(0).getReg();
    3307             :     unsigned Src0 = MI.getOperand(1).getReg();
    3308             :     unsigned Src1 = MI.getOperand(2).getReg();
    3309             :     const DebugLoc &DL = MI.getDebugLoc();
    3310             :     unsigned SrcCond = MI.getOperand(3).getReg();
    3311             : 
    3312          49 :     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3313          49 :     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3314          49 :     unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    3315             : 
    3316          98 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
    3317          49 :       .addReg(SrcCond);
    3318          98 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
    3319          49 :       .addReg(Src0, 0, AMDGPU::sub0)
    3320          49 :       .addReg(Src1, 0, AMDGPU::sub0)
    3321          49 :       .addReg(SrcCondCopy);
    3322          98 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
    3323          49 :       .addReg(Src0, 0, AMDGPU::sub1)
    3324          49 :       .addReg(Src1, 0, AMDGPU::sub1)
    3325          49 :       .addReg(SrcCondCopy);
    3326             : 
    3327          98 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
    3328          49 :       .addReg(DstLo)
    3329             :       .addImm(AMDGPU::sub0)
    3330          49 :       .addReg(DstHi)
    3331             :       .addImm(AMDGPU::sub1);
    3332          49 :     MI.eraseFromParent();
    3333          49 :     return BB;
    3334             :   }
    3335          78 :   case AMDGPU::SI_BR_UNDEF: {
    3336          78 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3337             :     const DebugLoc &DL = MI.getDebugLoc();
    3338         156 :     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    3339             :                            .add(MI.getOperand(0));
    3340             :     Br->getOperand(1).setIsUndef(true); // read undef SCC
    3341          78 :     MI.eraseFromParent();
    3342          78 :     return BB;
    3343             :   }
    3344         874 :   case AMDGPU::ADJCALLSTACKUP:
    3345             :   case AMDGPU::ADJCALLSTACKDOWN: {
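                     :     // The call-stack adjustment pseudos only need implicit def/use operands
                     :     // on the stack pointer offset register; the instruction itself is left in
                     :     // place for later frame lowering.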
    3346         874 :     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    3347             :     MachineInstrBuilder MIB(*MF, &MI);
    3348         874 :     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
    3349         874 :         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
    3350             :     return BB;
    3351             :   }
    3352         471 :   case AMDGPU::SI_CALL_ISEL:
    3353             :   case AMDGPU::SI_TCRETURN_ISEL: {
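                     :     // Rewrite the isel pseudo into the real SI_CALL / SI_TCRETURN, attaching
                     :     // the callee's GlobalValue (taken from the defining SI_PC_ADD_REL_OFFSET)
                     :     // and copying over the remaining operands and memory references.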
    3354         471 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3355             :     const DebugLoc &DL = MI.getDebugLoc();
    3356             :     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    3357             : 
    3358             :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3359             :     unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    3360         471 :     MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    3361             :     assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
    3362             : 
    3363             :     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
    3364             : 
    3365             :     MachineInstrBuilder MIB;
    3366         471 :     if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
    3367         437 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
    3368             :         .add(MI.getOperand(0))
    3369             :         .addGlobalAddress(G);
    3370             :     } else {
    3371          34 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
    3372             :         .add(MI.getOperand(0))
    3373             :         .addGlobalAddress(G);
    3374             : 
    3375             :       // There is an additional imm operand for tcreturn, but it should be in the
    3376             :       // right place already.
    3377             :     }
    3378             : 
    3379        5731 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
    3380             :       MIB.add(MI.getOperand(I));
    3381             : 
    3382             :     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3383         471 :     MI.eraseFromParent();
    3384             :     return BB;
    3385             :   }
    3386           0 :   default:
    3387           0 :     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    3388             :   }
    3389             : }
    3390             : 
    3391       21524 : bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
    3392       43048 :   return isTypeLegal(VT.getScalarType());
    3393             : }
    3394             : 
    3395        3980 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
    3396             :   // This currently forces unfolding various combinations of fsub into fma with
    3397             :   // free fneg'd operands. As long as we have fast FMA (controlled by
    3398             :   // isFMAFasterThanFMulAndFAdd), we should perform these.
    3399             : 
    3400             :   // When fma is quarter rate, for f64 where add / sub are at best half rate,
    3401             :   // most of these combines appear to be cycle neutral but save on instruction
    3402             :   // count / code size.
    3403        3980 :   return true;
    3404             : }
    3405             : 
    3406       13252 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
    3407             :                                          EVT VT) const {
    3408       13252 :   if (!VT.isVector()) {
    3409       13181 :     return MVT::i1;
    3410             :   }
    3411         142 :   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
    3412             : }
    3413             : 
    3414      119079 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
    3415             :   // TODO: Should i16 always be used when legal? For now it would force VALU
    3416             :   // shifts.
    3417      119079 :   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
    3418             : }
    3419             : 
    3420             : // Answering this is somewhat tricky and depends on the specific device, since
    3421             : // devices have different rates for fma and for f64 operations in general.
    3422             : //
    3423             : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
    3424             : // regardless of which device (although the number of cycles differs between
    3425             : // devices), so it is always profitable for f64.
    3426             : //
    3427             : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
    3428             : // only on full rate devices. Normally, we should prefer selecting v_mad_f32,
    3429             : // which we can always do even without fused FP ops, since it returns the same
    3430             : // result as the separate operations and is always full rate. Therefore, we lie
    3431             : // and report that fma is not faster for f32. v_mad_f32, however, does not
    3432             : // support denormals, so we do report fma as faster if we have a fast fma
    3433             : // device and denormals are required.
    3434             : //
    3435       10789 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
    3436       10789 :   VT = VT.getScalarType();
    3437             : 
    3438       10789 :   switch (VT.getSimpleVT().SimpleTy) {
    3439        8210 :   case MVT::f32:
    3440             :     // This is as fast on some subtargets. However, we always have full rate f32
    3441             :     // mad available which returns the same result as the separate operations
    3442             :     // which we should prefer over fma. We can't use this if we want to support
    3443             :     // denormals, so only report this in these cases.
    3444        8210 :     return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
    3445             :   case MVT::f64:
    3446             :     return true;
    3447        1663 :   case MVT::f16:
    3448        1663 :     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
    3449             :   default:
    3450             :     break;
    3451             :   }
    3452             : 
    3453           0 :   return false;
    3454             : }
    3455             : 
    3456             : //===----------------------------------------------------------------------===//
    3457             : // Custom DAG Lowering Operations
    3458             : //===----------------------------------------------------------------------===//
    3459             : 
    3460      187759 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    3461      187759 :   switch (Op.getOpcode()) {
    3462       19304 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    3463        1563 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    3464       80641 :   case ISD::LOAD: {
    3465       80641 :     SDValue Result = LowerLOAD(Op, DAG);
    3466             :     assert((!Result.getNode() ||
    3467             :             Result.getNode()->getNumValues() == 2) &&
    3468             :            "Load should return a value and a chain");
    3469       80641 :     return Result;
    3470             :   }
    3471             : 
    3472          51 :   case ISD::FSIN:
    3473             :   case ISD::FCOS:
    3474          51 :     return LowerTrig(Op, DAG);
    3475         638 :   case ISD::SELECT: return LowerSELECT(Op, DAG);
    3476         266 :   case ISD::FDIV: return LowerFDIV(Op, DAG);
    3477         261 :   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
    3478       72958 :   case ISD::STORE: return LowerSTORE(Op, DAG);
    3479         860 :   case ISD::GlobalAddress: {
    3480             :     MachineFunction &MF = DAG.getMachineFunction();
    3481         860 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    3482         860 :     return LowerGlobalAddress(MFI, Op, DAG);
    3483             :   }
    3484        6092 :   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    3485        1370 :   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
    3486        2099 :   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
    3487          45 :   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
    3488          58 :   case ISD::INSERT_VECTOR_ELT:
    3489          58 :     return lowerINSERT_VECTOR_ELT(Op, DAG);
    3490        1040 :   case ISD::EXTRACT_VECTOR_ELT:
    3491        1040 :     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    3492         477 :   case ISD::FP_ROUND:
    3493         477 :     return lowerFP_ROUND(Op, DAG);
    3494          36 :   case ISD::TRAP:
    3495             :   case ISD::DEBUGTRAP:
    3496          36 :     return lowerTRAP(Op, DAG);
    3497             :   }
    3498             :   return SDValue();
    3499             : }
    3500             : 
    3501             : static unsigned getImageOpcode(unsigned IID) {
    3502             :   switch (IID) {
    3503             :   case Intrinsic::amdgcn_image_load:
    3504             :     return AMDGPUISD::IMAGE_LOAD;
    3505             :   case Intrinsic::amdgcn_image_load_mip:
    3506             :     return AMDGPUISD::IMAGE_LOAD_MIP;
    3507             : 
    3508             :   // Basic sample.
    3509             :   case Intrinsic::amdgcn_image_sample:
    3510             :     return AMDGPUISD::IMAGE_SAMPLE;
    3511             :   case Intrinsic::amdgcn_image_sample_cl:
    3512             :     return AMDGPUISD::IMAGE_SAMPLE_CL;
    3513             :   case Intrinsic::amdgcn_image_sample_d:
    3514             :     return AMDGPUISD::IMAGE_SAMPLE_D;
    3515             :   case Intrinsic::amdgcn_image_sample_d_cl:
    3516             :     return AMDGPUISD::IMAGE_SAMPLE_D_CL;
    3517             :   case Intrinsic::amdgcn_image_sample_l:
    3518             :     return AMDGPUISD::IMAGE_SAMPLE_L;
    3519             :   case Intrinsic::amdgcn_image_sample_b:
    3520             :     return AMDGPUISD::IMAGE_SAMPLE_B;
    3521             :   case Intrinsic::amdgcn_image_sample_b_cl:
    3522             :     return AMDGPUISD::IMAGE_SAMPLE_B_CL;
    3523             :   case Intrinsic::amdgcn_image_sample_lz:
    3524             :     return AMDGPUISD::IMAGE_SAMPLE_LZ;
    3525             :   case Intrinsic::amdgcn_image_sample_cd:
    3526             :     return AMDGPUISD::IMAGE_SAMPLE_CD;
    3527             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    3528             :     return AMDGPUISD::IMAGE_SAMPLE_CD_CL;
    3529             : 
    3530             :   // Sample with comparison.
    3531             :   case Intrinsic::amdgcn_image_sample_c:
    3532             :     return AMDGPUISD::IMAGE_SAMPLE_C;
    3533             :   case Intrinsic::amdgcn_image_sample_c_cl:
    3534             :     return AMDGPUISD::IMAGE_SAMPLE_C_CL;
    3535             :   case Intrinsic::amdgcn_image_sample_c_d:
    3536             :     return AMDGPUISD::IMAGE_SAMPLE_C_D;
    3537             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    3538             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_CL;
    3539             :   case Intrinsic::amdgcn_image_sample_c_l:
    3540             :     return AMDGPUISD::IMAGE_SAMPLE_C_L;
    3541             :   case Intrinsic::amdgcn_image_sample_c_b:
    3542             :     return AMDGPUISD::IMAGE_SAMPLE_C_B;
    3543             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    3544             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_CL;
    3545             :   case Intrinsic::amdgcn_image_sample_c_lz:
    3546             :     return AMDGPUISD::IMAGE_SAMPLE_C_LZ;
    3547             :   case Intrinsic::amdgcn_image_sample_c_cd:
    3548             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD;
    3549             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    3550             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL;
    3551             : 
    3552             :   // Sample with offsets.
    3553             :   case Intrinsic::amdgcn_image_sample_o:
    3554             :     return AMDGPUISD::IMAGE_SAMPLE_O;
    3555             :   case Intrinsic::amdgcn_image_sample_cl_o:
    3556             :     return AMDGPUISD::IMAGE_SAMPLE_CL_O;
    3557             :   case Intrinsic::amdgcn_image_sample_d_o:
    3558             :     return AMDGPUISD::IMAGE_SAMPLE_D_O;
    3559             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    3560             :     return AMDGPUISD::IMAGE_SAMPLE_D_CL_O;
    3561             :   case Intrinsic::amdgcn_image_sample_l_o:
    3562             :     return AMDGPUISD::IMAGE_SAMPLE_L_O;
    3563             :   case Intrinsic::amdgcn_image_sample_b_o:
    3564             :     return AMDGPUISD::IMAGE_SAMPLE_B_O;
    3565             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    3566             :     return AMDGPUISD::IMAGE_SAMPLE_B_CL_O;
    3567             :   case Intrinsic::amdgcn_image_sample_lz_o:
    3568             :     return AMDGPUISD::IMAGE_SAMPLE_LZ_O;
    3569             :   case Intrinsic::amdgcn_image_sample_cd_o:
    3570             :     return AMDGPUISD::IMAGE_SAMPLE_CD_O;
    3571             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    3572             :     return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O;
    3573             : 
    3574             :   // Sample with comparison and offsets.
    3575             :   case Intrinsic::amdgcn_image_sample_c_o:
    3576             :     return AMDGPUISD::IMAGE_SAMPLE_C_O;
    3577             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    3578             :     return AMDGPUISD::IMAGE_SAMPLE_C_CL_O;
    3579             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    3580             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_O;
    3581             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    3582             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O;
    3583             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    3584             :     return AMDGPUISD::IMAGE_SAMPLE_C_L_O;
    3585             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    3586             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_O;
    3587             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    3588             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O;
    3589             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    3590             :     return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O;
    3591             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    3592             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_O;
    3593             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    3594             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O;
    3595             : 
    3596             :   // Basic gather4.
    3597             :   case Intrinsic::amdgcn_image_gather4:
    3598             :     return AMDGPUISD::IMAGE_GATHER4;
    3599             :   case Intrinsic::amdgcn_image_gather4_cl:
    3600             :     return AMDGPUISD::IMAGE_GATHER4_CL;
    3601             :   case Intrinsic::amdgcn_image_gather4_l:
    3602             :     return AMDGPUISD::IMAGE_GATHER4_L;
    3603             :   case Intrinsic::amdgcn_image_gather4_b:
    3604             :     return AMDGPUISD::IMAGE_GATHER4_B;
    3605             :   case Intrinsic::amdgcn_image_gather4_b_cl:
    3606             :     return AMDGPUISD::IMAGE_GATHER4_B_CL;
    3607             :   case Intrinsic::amdgcn_image_gather4_lz:
    3608             :     return AMDGPUISD::IMAGE_GATHER4_LZ;
    3609             : 
    3610             :   // Gather4 with comparison.
    3611             :   case Intrinsic::amdgcn_image_gather4_c:
    3612             :     return AMDGPUISD::IMAGE_GATHER4_C;
    3613             :   case Intrinsic::amdgcn_image_gather4_c_cl:
    3614             :     return AMDGPUISD::IMAGE_GATHER4_C_CL;
    3615             :   case Intrinsic::amdgcn_image_gather4_c_l:
    3616             :     return AMDGPUISD::IMAGE_GATHER4_C_L;
    3617             :   case Intrinsic::amdgcn_image_gather4_c_b:
    3618             :     return AMDGPUISD::IMAGE_GATHER4_C_B;
    3619             :   case Intrinsic::amdgcn_image_gather4_c_b_cl:
    3620             :     return AMDGPUISD::IMAGE_GATHER4_C_B_CL;
    3621             :   case Intrinsic::amdgcn_image_gather4_c_lz:
    3622             :     return AMDGPUISD::IMAGE_GATHER4_C_LZ;
    3623             : 
    3624             :   // Gather4 with offsets.
    3625             :   case Intrinsic::amdgcn_image_gather4_o:
    3626             :     return AMDGPUISD::IMAGE_GATHER4_O;
    3627             :   case Intrinsic::amdgcn_image_gather4_cl_o:
    3628             :     return AMDGPUISD::IMAGE_GATHER4_CL_O;
    3629             :   case Intrinsic::amdgcn_image_gather4_l_o:
    3630             :     return AMDGPUISD::IMAGE_GATHER4_L_O;
    3631             :   case Intrinsic::amdgcn_image_gather4_b_o:
    3632             :     return AMDGPUISD::IMAGE_GATHER4_B_O;
    3633             :   case Intrinsic::amdgcn_image_gather4_b_cl_o:
    3634             :     return AMDGPUISD::IMAGE_GATHER4_B_CL_O;
    3635             :   case Intrinsic::amdgcn_image_gather4_lz_o:
    3636             :     return AMDGPUISD::IMAGE_GATHER4_LZ_O;
    3637             : 
    3638             :   // Gather4 with comparison and offsets.
    3639             :   case Intrinsic::amdgcn_image_gather4_c_o:
    3640             :     return AMDGPUISD::IMAGE_GATHER4_C_O;
    3641             :   case Intrinsic::amdgcn_image_gather4_c_cl_o:
    3642             :     return AMDGPUISD::IMAGE_GATHER4_C_CL_O;
    3643             :   case Intrinsic::amdgcn_image_gather4_c_l_o:
    3644             :     return AMDGPUISD::IMAGE_GATHER4_C_L_O;
    3645             :   case Intrinsic::amdgcn_image_gather4_c_b_o:
    3646             :     return AMDGPUISD::IMAGE_GATHER4_C_B_O;
    3647             :   case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
    3648             :     return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O;
    3649             :   case Intrinsic::amdgcn_image_gather4_c_lz_o:
    3650             :     return AMDGPUISD::IMAGE_GATHER4_C_LZ_O;
    3651             : 
    3652             :   default:
    3653             :     break;
    3654             :   }
    3655             :   return 0;
    3656             : }
    3657             : 
    3658          52 : static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
    3659             :                                    SelectionDAG &DAG, bool Unpacked) {
    3660          52 :   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    3661             :     // Truncate to v2i16/v4i16.
    3662          19 :     EVT IntLoadVT = LoadVT.changeTypeToInteger();
    3663          19 :     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
    3664             :     // Bitcast to original type (v2f16/v4f16).
    3665          19 :     return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    3666             :   }
    3667             :   // Cast back to the original packed type.
    3668          33 :   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
    3669             : }
    3670             : 
    3671             : // Lowers INTRINSIC_W_CHAIN nodes whose result type is illegal.
    3672          52 : SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
    3673             :                                      SDValue &Chain, SelectionDAG &DAG) const {
    3674          52 :   EVT LoadVT = Op.getValueType();
    3675             :   // TODO: handle v3f16.
    3676             :   if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
    3677           0 :     return SDValue();
    3678             : 
    3679          52 :   bool Unpacked = Subtarget->hasUnpackedD16VMem();
    3680             :   EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
    3681             :   EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
    3682          52 :                                getEquivalentMemType(*DAG.getContext(), LoadVT);
    3683             :   // Change from v4f16/v2f16 to EquivLoadVT.
    3684          52 :   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
    3685             : 
    3686          52 :   SDValue Res;
    3687             :   SDLoc DL(Op);
    3688             :   MemSDNode *M = cast<MemSDNode>(Op);
    3689          52 :   unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    3690          52 :   switch (IID) {
    3691           5 :   case Intrinsic::amdgcn_tbuffer_load: {
    3692             :     SDValue Ops[] = {
    3693             :       Op.getOperand(0),  // Chain
    3694             :       Op.getOperand(2),  // rsrc
    3695             :       Op.getOperand(3),  // vindex
    3696             :       Op.getOperand(4),  // voffset
    3697             :       Op.getOperand(5),  // soffset
    3698             :       Op.getOperand(6),  // offset
    3699             :       Op.getOperand(7),  // dfmt
    3700             :       Op.getOperand(8),  // nfmt
    3701             :       Op.getOperand(9),  // glc
    3702             :       Op.getOperand(10)  // slc
    3703           5 :     };
    3704           5 :     Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
    3705             :                                   VTList, Ops, M->getMemoryVT(),
    3706           5 :                                   M->getMemOperand());
    3707           5 :     Chain = Res.getValue(1);
    3708          15 :     return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
    3709             :   }
    3710           5 :   case Intrinsic::amdgcn_buffer_load_format: {
    3711             :     SDValue Ops[] = {
    3712             :       Op.getOperand(0), // Chain
    3713             :       Op.getOperand(2), // rsrc
    3714             :       Op.getOperand(3), // vindex
    3715             :       Op.getOperand(4), // offset
    3716             :       Op.getOperand(5), // glc
    3717             :       Op.getOperand(6)  // slc
    3718           5 :     };
    3719           5 :     Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
    3720             :                                    DL, VTList, Ops, M->getMemoryVT(),
    3721           5 :                                    M->getMemOperand());
    3722           5 :     Chain = Res.getValue(1);
    3723          15 :     return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
    3724             :   }
    3725           8 :   case Intrinsic::amdgcn_image_load:
    3726             :   case Intrinsic::amdgcn_image_load_mip: {
    3727             :     SDValue Ops[] = {
    3728             :         Op.getOperand(0),  // Chain
    3729             :         Op.getOperand(2),  // vaddr
    3730             :         Op.getOperand(3),  // rsrc
    3731             :         Op.getOperand(4),  // dmask
    3732             :         Op.getOperand(5),  // glc
    3733             :         Op.getOperand(6),  // slc
    3734             :         Op.getOperand(7),  // lwe
    3735             :         Op.getOperand(8)   // da
    3736           8 :     };
    3737             :     unsigned Opc = getImageOpcode(IID);
    3738           8 :     Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
    3739           8 :                                   M->getMemOperand());
    3740           8 :     Chain = Res.getValue(1);
    3741          24 :     return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
    3742             :   }
    3743             :   // Basic sample.
    3744          34 :   case Intrinsic::amdgcn_image_sample:
    3745             :   case Intrinsic::amdgcn_image_sample_cl:
    3746             :   case Intrinsic::amdgcn_image_sample_d:
    3747             :   case Intrinsic::amdgcn_image_sample_d_cl:
    3748             :   case Intrinsic::amdgcn_image_sample_l:
    3749             :   case Intrinsic::amdgcn_image_sample_b:
    3750             :   case Intrinsic::amdgcn_image_sample_b_cl:
    3751             :   case Intrinsic::amdgcn_image_sample_lz:
    3752             :   case Intrinsic::amdgcn_image_sample_cd:
    3753             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    3754             : 
    3755             :   // Sample with comparison.
    3756             :   case Intrinsic::amdgcn_image_sample_c:
    3757             :   case Intrinsic::amdgcn_image_sample_c_cl:
    3758             :   case Intrinsic::amdgcn_image_sample_c_d:
    3759             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    3760             :   case Intrinsic::amdgcn_image_sample_c_l:
    3761             :   case Intrinsic::amdgcn_image_sample_c_b:
    3762             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    3763             :   case Intrinsic::amdgcn_image_sample_c_lz:
    3764             :   case Intrinsic::amdgcn_image_sample_c_cd:
    3765             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    3766             : 
    3767             :   // Sample with offsets.
    3768             :   case Intrinsic::amdgcn_image_sample_o:
    3769             :   case Intrinsic::amdgcn_image_sample_cl_o:
    3770             :   case Intrinsic::amdgcn_image_sample_d_o:
    3771             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    3772             :   case Intrinsic::amdgcn_image_sample_l_o:
    3773             :   case Intrinsic::amdgcn_image_sample_b_o:
    3774             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    3775             :   case Intrinsic::amdgcn_image_sample_lz_o:
    3776             :   case Intrinsic::amdgcn_image_sample_cd_o:
    3777             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    3778             : 
    3779             :   // Sample with comparison and offsets.
    3780             :   case Intrinsic::amdgcn_image_sample_c_o:
    3781             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    3782             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    3783             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    3784             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    3785             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    3786             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    3787             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    3788             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    3789             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    3790             : 
    3791             :   // Basic gather4
    3792             :   case Intrinsic::amdgcn_image_gather4:
    3793             :   case Intrinsic::amdgcn_image_gather4_cl:
    3794             :   case Intrinsic::amdgcn_image_gather4_l:
    3795             :   case Intrinsic::amdgcn_image_gather4_b:
    3796             :   case Intrinsic::amdgcn_image_gather4_b_cl:
    3797             :   case Intrinsic::amdgcn_image_gather4_lz:
    3798             : 
    3799             :   // Gather4 with comparison
    3800             :   case Intrinsic::amdgcn_image_gather4_c:
    3801             :   case Intrinsic::amdgcn_image_gather4_c_cl:
    3802             :   case Intrinsic::amdgcn_image_gather4_c_l:
    3803             :   case Intrinsic::amdgcn_image_gather4_c_b:
    3804             :   case Intrinsic::amdgcn_image_gather4_c_b_cl:
    3805             :   case Intrinsic::amdgcn_image_gather4_c_lz:
    3806             : 
    3807             :   // Gather4 with offsets
    3808             :   case Intrinsic::amdgcn_image_gather4_o:
    3809             :   case Intrinsic::amdgcn_image_gather4_cl_o:
    3810             :   case Intrinsic::amdgcn_image_gather4_l_o:
    3811             :   case Intrinsic::amdgcn_image_gather4_b_o:
    3812             :   case Intrinsic::amdgcn_image_gather4_b_cl_o:
    3813             :   case Intrinsic::amdgcn_image_gather4_lz_o:
    3814             : 
    3815             :   // Gather4 with comparison and offsets
    3816             :   case Intrinsic::amdgcn_image_gather4_c_o:
    3817             :   case Intrinsic::amdgcn_image_gather4_c_cl_o:
    3818             :   case Intrinsic::amdgcn_image_gather4_c_l_o:
    3819             :   case Intrinsic::amdgcn_image_gather4_c_b_o:
    3820             :   case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
    3821             :   case Intrinsic::amdgcn_image_gather4_c_lz_o: {
    3822             :     SDValue Ops[] = {
    3823             :       Op.getOperand(0),  // Chain
    3824             :       Op.getOperand(2),  // vaddr
    3825             :       Op.getOperand(3),  // rsrc
    3826             :       Op.getOperand(4),  // sampler
    3827             :       Op.getOperand(5),  // dmask
    3828             :       Op.getOperand(6),  // unorm
    3829             :       Op.getOperand(7),  // glc
    3830             :       Op.getOperand(8),  // slc
    3831             :       Op.getOperand(9),  // lwe
    3832             :       Op.getOperand(10)  // da
    3833          34 :     };
    3834             :     unsigned Opc = getImageOpcode(IID);
    3835          34 :     Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
    3836          34 :                                    M->getMemOperand());
    3837          34 :     Chain = Res.getValue(1);
    3838         102 :     return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
    3839             :   }
    3840           0 :   default:
    3841           0 :     return SDValue();
    3842             :   }
    3843             : }
    3844             : 
    3845         329 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
    3846             :                                           SmallVectorImpl<SDValue> &Results,
    3847             :                                           SelectionDAG &DAG) const {
    3848         329 :   switch (N->getOpcode()) {
    3849             :   case ISD::INSERT_VECTOR_ELT: {
    3850          88 :     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
    3851          10 :       Results.push_back(Res);
    3852          88 :     return;
    3853             :   }
    3854             :   case ISD::EXTRACT_VECTOR_ELT: {
    3855           0 :     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
    3856           0 :       Results.push_back(Res);
    3857           0 :     return;
    3858             :   }
    3859         138 :   case ISD::INTRINSIC_WO_CHAIN: {
    3860         138 :     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    3861             :     switch (IID) {
    3862          54 :     case Intrinsic::amdgcn_cvt_pkrtz: {
    3863          54 :       SDValue Src0 = N->getOperand(1);
    3864          54 :       SDValue Src1 = N->getOperand(2);
    3865             :       SDLoc SL(N);
    3866             :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
    3867          54 :                                 Src0, Src1);
    3868         108 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    3869             :       return;
    3870             :     }
    3871          84 :     case Intrinsic::amdgcn_cvt_pknorm_i16:
    3872             :     case Intrinsic::amdgcn_cvt_pknorm_u16:
    3873             :     case Intrinsic::amdgcn_cvt_pk_i16:
    3874             :     case Intrinsic::amdgcn_cvt_pk_u16: {
    3875          84 :       SDValue Src0 = N->getOperand(1);
    3876          84 :       SDValue Src1 = N->getOperand(2);
    3877             :       SDLoc SL(N);
    3878             :       unsigned Opcode;
    3879             : 
    3880          84 :       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
    3881             :         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    3882          57 :       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
    3883             :         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    3884          30 :       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
    3885             :         Opcode = AMDGPUISD::CVT_PK_I16_I32;
    3886             :       else
    3887             :         Opcode = AMDGPUISD::CVT_PK_U16_U32;
    3888             : 
    3889          84 :       SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
    3890         168 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    3891             :       return;
    3892             :     }
    3893             :     }
    3894             :     break;
    3895             :   }
    3896          52 :   case ISD::INTRINSIC_W_CHAIN: {
    3897          52 :     SDValue Chain;
    3898          52 :     if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
    3899          52 :                                                              Chain, DAG)) {
    3900          52 :       Results.push_back(Res);
    3901          52 :       Results.push_back(Chain);
    3902          52 :       return;
    3903             :     }
    3904           0 :     break;
    3905             :   }
    3906             :   case ISD::SELECT: {
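                     :     // The result type is illegal here; do the select on an equivalent integer
                     :     // type (extended to i32 if it is narrower) and bitcast the result back.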
    3907             :     SDLoc SL(N);
    3908          40 :     EVT VT = N->getValueType(0);
    3909          40 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    3910          40 :     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    3911          40 :     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
    3912             : 
    3913          40 :     EVT SelectVT = NewVT;
    3914          40 :     if (NewVT.bitsLT(MVT::i32)) {
    3915           2 :       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
    3916           2 :       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
    3917             :       SelectVT = MVT::i32;
    3918             :     }
    3919             : 
    3920             :     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
    3921          40 :                                     N->getOperand(0), LHS, RHS);
    3922             : 
    3923           0 :     if (NewVT != SelectVT)
    3924           2 :       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    3925          80 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    3926             :     return;
    3927             :   }
    3928             :   default:
    3929             :     break;
    3930             :   }
    3931             : }
    3932             : 
    3933             : /// \brief Helper function for LowerBRCOND
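                     : /// Returns the first user of \p Value whose opcode is \p Opcode, or nullptr
                     : /// if there is no such user.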
    3934             : static SDNode *findUser(SDValue Value, unsigned Opcode) {
    3935             : 
    3936             :   SDNode *Parent = Value.getNode();
    3937             :   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
    3938        1470 :        I != E; ++I) {
    3939             : 
    3940         785 :     if (I.getUse().get() != Value)
    3941             :       continue;
    3942             : 
    3943         682 :     if (I->getOpcode() == Opcode)
    3944             :       return *I;
    3945             :   }
    3946             :   return nullptr;
    3947             : }
    3948             : 
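                     : /// If \p Intr is one of the amdgcn control flow intrinsics handled by
                     : /// LowerBRCOND (if, else, loop), return the corresponding AMDGPUISD opcode;
                     : /// otherwise return 0.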
    3949        1563 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    3950        1563 :   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    3951         447 :     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    3952             :     case Intrinsic::amdgcn_if:
    3953             :       return AMDGPUISD::IF;
    3954          48 :     case Intrinsic::amdgcn_else:
    3955          48 :       return AMDGPUISD::ELSE;
    3956          54 :     case Intrinsic::amdgcn_loop:
    3957          54 :       return AMDGPUISD::LOOP;
    3958           0 :     case Intrinsic::amdgcn_end_cf:
    3959           0 :       llvm_unreachable("should not occur");
    3960           2 :     default:
    3961           2 :       return 0;
    3962             :     }
    3963             :   }
    3964             : 
    3965             :   // break, if_break, else_break are all only used as inputs to loop, not
    3966             :   // directly as branch conditions.
    3967             :   return 0;
    3968             : }
    3969             : 
    3970           4 : void SITargetLowering::createDebuggerPrologueStackObjects(
    3971             :     MachineFunction &MF) const {
    3972             :   // Create stack objects that are used for emitting debugger prologue.
    3973             :   //
    3974             :   // The debugger prologue writes work group IDs and work item IDs to scratch memory
    3975             :   // at fixed locations in the following format:
    3976             :   //   offset 0:  work group ID x
    3977             :   //   offset 4:  work group ID y
    3978             :   //   offset 8:  work group ID z
    3979             :   //   offset 16: work item ID x
    3980             :   //   offset 20: work item ID y
    3981             :   //   offset 24: work item ID z
    3982           4 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3983             :   int ObjectIdx = 0;
    3984             : 
    3985             :   // For each dimension:
    3986          28 :   for (unsigned i = 0; i < 3; ++i) {
    3987             :     // Create fixed stack object for work group ID.
    3988          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
    3989             :     Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    3990             :     // Create fixed stack object for work item ID.
    3991          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
    3992             :     Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
    3993             :   }
    3994           4 : }
    3995             : 
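                     : // A global in the constant address space that is emitted into the .text
                     : // section can be addressed with a simple PC-relative fixup and needs no
                     : // relocation.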
    3996        1091 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
    3997             :   const Triple &TT = getTargetMachine().getTargetTriple();
    3998        1012 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    3999        1170 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    4000        1170 :          AMDGPU::shouldEmitConstantsToTextSection(TT);
    4001             : }
    4002             : 
    4003         561 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
    4004         507 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    4005         453 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    4006         108 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    4007         734 :          !shouldEmitFixup(GV) &&
    4008         626 :          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
    4009             : }
    4010             : 
    4011         482 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
    4012         482 :   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
    4013             : }
    4014             : 
    4015             : /// This transforms the control flow intrinsics to get the branch destination as
    4016             : /// the last parameter, and also switches the branch target with BR if the need arises.
    4017        1563 : SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
    4018             :                                       SelectionDAG &DAG) const {
    4019             :   SDLoc DL(BRCOND);
    4020             : 
    4021             :   SDNode *Intr = BRCOND.getOperand(1).getNode();
    4022        1563 :   SDValue Target = BRCOND.getOperand(2);
    4023             :   SDNode *BR = nullptr;
    4024             :   SDNode *SetCC = nullptr;
    4025             : 
    4026        1563 :   if (Intr->getOpcode() == ISD::SETCC) {
    4027             :     // As long as we negate the condition, everything is fine
    4028             :     SetCC = Intr;
    4029             :     Intr = SetCC->getOperand(0).getNode();
    4030             : 
    4031             :   } else {
    4032             :     // Get the target from BR if we don't negate the condition
    4033             :     BR = findUser(BRCOND, ISD::BR);
    4034         294 :     Target = BR->getOperand(1);
    4035             :   }
    4036             : 
    4037             :   // FIXME: This changes the types of the intrinsics instead of introducing new
    4038             :   // nodes with the correct types.
    4039             :   // e.g. llvm.amdgcn.loop
    4040             : 
    4041             :   // e.g.: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
    4042             :   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
    4043             : 
    4044        1563 :   unsigned CFNode = isCFIntrinsic(Intr);
    4045        1563 :   if (CFNode == 0) {
    4046             :     // This is a uniform branch so we don't need to legalize.
    4047        1118 :     return BRCOND;
    4048             :   }
    4049             : 
    4050         445 :   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
    4051             :                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
    4052             : 
    4053             :   assert(!SetCC ||
    4054             :         (SetCC->getConstantOperandVal(1) == 1 &&
    4055             :          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
    4056             :                                                              ISD::SETNE));
    4057             : 
    4058             :   // operands of the new intrinsic call
    4059             :   SmallVector<SDValue, 4> Ops;
    4060         445 :   if (HaveChain)
    4061         445 :     Ops.push_back(BRCOND.getOperand(0));
    4062             : 
    4063         445 :   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
    4064         445 :   Ops.push_back(Target);
    4065             : 
    4066         445 :   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
    4067             : 
    4068             :   // build the new intrinsic call
    4069         445 :   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
    4070             : 
    4071         445 :   if (!HaveChain) {
    4072             :     SDValue Ops[] =  {
    4073             :       SDValue(Result, 0),
    4074             :       BRCOND.getOperand(0)
    4075           0 :     };
    4076             : 
    4077           0 :     Result = DAG.getMergeValues(Ops, DL).getNode();
    4078             :   }
    4079             : 
    4080         445 :   if (BR) {
    4081             :     // Give the branch instruction our target
    4082             :     SDValue Ops[] = {
    4083             :       BR->getOperand(0),
    4084             :       BRCOND.getOperand(2)
    4085          86 :     };
    4086          86 :     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    4087          86 :     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    4088             :     BR = NewBR.getNode();
    4089             :   }
    4090             : 
    4091         445 :   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
    4092             : 
    4093             :   // Copy the intrinsic results to registers
    4094         836 :   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    4095             :     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    4096         391 :     if (!CopyToReg)
    4097           3 :       continue;
    4098             : 
    4099         388 :     Chain = DAG.getCopyToReg(
    4100             :       Chain, DL,
    4101             :       CopyToReg->getOperand(1),
    4102             :       SDValue(Result, i - 1),
    4103         776 :       SDValue());
    4104             : 
    4105         388 :     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
    4106             :   }
    4107             : 
    4108             :   // Remove the old intrinsic from the chain
    4109         890 :   DAG.ReplaceAllUsesOfValueWith(
    4110             :     SDValue(Intr, Intr->getNumValues() - 1),
    4111             :     Intr->getOperand(0));
    4112             : 
    4113         445 :   return Chain;
    4114             : }
    4115             : 
    4116        2571 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
    4117             :                                             SDValue Op,
    4118             :                                             const SDLoc &DL,
    4119             :                                             EVT VT) const {
    4120        5142 :   return Op.getValueType().bitsLE(VT) ?
    4121        2571 :       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
    4122        7713 :       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
    4123             : }
    4124             : 
    4125         477 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
    4126             :   assert(Op.getValueType() == MVT::f16 &&
    4127             :          "Do not know how to custom lower FP_ROUND for non-f16 type");
    4128             : 
    4129         477 :   SDValue Src = Op.getOperand(0);
    4130             :   EVT SrcVT = Src.getValueType();
    4131             :   if (SrcVT != MVT::f64)
    4132         467 :     return Op;
    4133             : 
    4134             :   SDLoc DL(Op);
    4135             : 
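                     :   // Lower f64 -> f16 by producing the half bits in an i32 with FP_TO_FP16 and
                     :   // reinterpreting the low 16 bits as f16.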
    4136          10 :   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
    4137          10 :   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    4138          10 :   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    4139             : }
    4140             : 
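                     : // Lower trap and debugtrap. When the HSA trap handler is enabled, the queue
                     : // pointer is passed to the handler in SGPR0/1 and an AMDGPUISD::TRAP node is
                     : // emitted; otherwise llvm.trap becomes ENDPGM and llvm.debugtrap is dropped
                     : // with a diagnostic.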
    4141          36 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
    4142             :   SDLoc SL(Op);
    4143             :   MachineFunction &MF = DAG.getMachineFunction();
    4144          36 :   SDValue Chain = Op.getOperand(0);
    4145             : 
    4146          36 :   unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
    4147             :     SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
    4148             : 
    4149          52 :   if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
    4150             :       Subtarget->isTrapHandlerEnabled()) {
    4151           8 :     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    4152             :     unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    4153             :     assert(UserSGPR != AMDGPU::NoRegister);
    4154             : 
    4155             :     SDValue QueuePtr = CreateLiveInRegister(
    4156          16 :       DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    4157             : 
    4158           8 :     SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
    4159             : 
    4160             :     SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
    4161           8 :                                      QueuePtr, SDValue());
    4162             : 
    4163             :     SDValue Ops[] = {
    4164             :       ToReg,
    4165           8 :       DAG.getTargetConstant(TrapID, SL, MVT::i16),
    4166             :       SGPR01,
    4167             :       ToReg.getValue(1)
    4168          16 :     };
    4169             : 
    4170           8 :     return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    4171             :   }
    4172             : 
    4173          28 :   switch (TrapID) {
    4174             :   case SISubtarget::TrapIDLLVMTrap:
    4175          21 :     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
    4176           7 :   case SISubtarget::TrapIDLLVMDebugTrap: {
    4177             :     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
    4178             :                                      "debugtrap handler not supported",
    4179             :                                      Op.getDebugLoc(),
    4180          14 :                                      DS_Warning);
    4181           7 :     LLVMContext &Ctx = MF.getFunction().getContext();
    4182           7 :     Ctx.diagnose(NoTrap);
    4183           7 :     return Chain;
    4184             :   }
    4185           0 :   default:
    4186           0 :     llvm_unreachable("unsupported trap handler type!");
    4187             :   }
    4188             : 
    4189             :   return Chain;
    4190             : }
    4191             : 
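                     : // Return the high 32 bits (the aperture) of the flat address for the given
                     : // LOCAL or PRIVATE address space, either read via S_GETREG from the aperture
                     : // registers or loaded from the queue descriptor.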
    4192          32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
    4193             :                                              SelectionDAG &DAG) const {
    4194             :   // FIXME: Use inline constants (src_{shared, private}_base) instead.
    4195          32 :   if (Subtarget->hasApertureRegs()) {
    4196          12 :     unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4197             :         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
    4198             :         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    4199             :     unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4200             :         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
    4201             :         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    4202          12 :     unsigned Encoding =
    4203             :         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
    4204          12 :         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
    4205             :         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
    4206             : 
    4207          24 :     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    4208             :     SDValue ApertureReg = SDValue(
    4209          24 :         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    4210          12 :     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    4211          12 :     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
    4212             :   }
    4213             : 
    4214             :   MachineFunction &MF = DAG.getMachineFunction();
    4215          20 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    4216             :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    4217             :   assert(UserSGPR != AMDGPU::NoRegister);
    4218             : 
    4219             :   SDValue QueuePtr = CreateLiveInRegister(
    4220          40 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    4221             : 
    4222             :   // Offset into amd_queue_t for group_segment_aperture_base_hi /
    4223             :   // private_segment_aperture_base_hi.
    4224          20 :   uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
    4225             : 
    4226          20 :   SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
    4227             : 
    4228             :   // TODO: Use custom target PseudoSourceValue.
    4229             :   // TODO: We should use the value from the IR intrinsic call, but it might not
    4230             :   // be available, and it is not clear how to get it here.
    4231          20 :   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
    4232          20 :                                               AMDGPUASI.CONSTANT_ADDRESS));
    4233             : 
    4234             :   MachinePointerInfo PtrInfo(V, StructOffset);
    4235             :   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
    4236             :                      MinAlign(64, StructOffset),
    4237             :                      MachineMemOperand::MODereferenceable |
    4238          40 :                          MachineMemOperand::MOInvariant);
    4239             : }
    4240             : 
    4241          45 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
    4242             :                                              SelectionDAG &DAG) const {
    4243             :   SDLoc SL(Op);
    4244             :   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
    4245             : 
    4246          45 :   SDValue Src = ASC->getOperand(0);
    4247          45 :   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
    4248             : 
    4249             :   const AMDGPUTargetMachine &TM =
    4250             :     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
    4251             : 
    4252             :   // flat -> local/private
    4253          45 :   if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4254             :     unsigned DestAS = ASC->getDestAddressSpace();
    4255             : 
    4256          17 :     if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
    4257           5 :         DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4258             :       unsigned NullVal = TM.getNullPointerValue(DestAS);
    4259          12 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4260          12 :       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
    4261          12 :       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
    4262             : 
    4263             :       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
    4264          12 :                          NonNull, Ptr, SegmentNullPtr);
    4265             :     }
    4266             :   }
    4267             : 
    4268             :   // local/private -> flat
    4269          33 :   if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4270             :     unsigned SrcAS = ASC->getSrcAddressSpace();
    4271             : 
    4272          54 :     if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
    4273          22 :         SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4274             :       unsigned NullVal = TM.getNullPointerValue(SrcAS);
    4275          32 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4276             : 
    4277             :       SDValue NonNull
    4278          32 :         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
    4279             : 
    4280          32 :       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
    4281             :       SDValue CvtPtr
    4282          32 :         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
    4283             : 
    4284             :       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
    4285             :                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
    4286          64 :                          FlatNullPtr);
    4287             :     }
    4288             :   }
    4289             : 
    4290             :   // global <-> flat are no-ops and never emitted.
    4291             : 
    4292             :   const MachineFunction &MF = DAG.getMachineFunction();
    4293             :   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    4294           2 :     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
    4295           1 :   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
    4296             : 
    4297           1 :   return DAG.getUNDEF(ASC->getValueType(0));
    4298             : }
    4299             : 
    4300         146 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    4301             :                                                  SelectionDAG &DAG) const {
    4302         146 :   SDValue Idx = Op.getOperand(2);
    4303             :   if (isa<ConstantSDNode>(Idx))
    4304         132 :     return SDValue();
    4305             : 
    4306             :   // Avoid stack access for dynamic indexing.
    4307             :   SDLoc SL(Op);
    4308          14 :   SDValue Vec = Op.getOperand(0);
    4309          14 :   SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
    4310             : 
    4311             :   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
    4312          14 :   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
    4313             : 
    4314             :   // Convert vector index to bit-index.
    4315             :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
    4316          28 :                                   DAG.getConstant(16, SL, MVT::i32));
    4317             : 
    4318          14 :   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    4319             : 
    4320             :   SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
    4321             :                             DAG.getConstant(0xffff, SL, MVT::i32),
    4322          28 :                             ScaledIdx);
    4323             : 
    4324          14 :   SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
    4325             :   SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
    4326          28 :                             DAG.getNOT(SL, BFM, MVT::i32), BCVec);
    4327             : 
    4328          14 :   SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
    4329          14 :   return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
    4330             : }
    4331             : 
    4332        1040 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
    4333             :                                                   SelectionDAG &DAG) const {
    4334             :   SDLoc SL(Op);
    4335             : 
    4336        1040 :   EVT ResultVT = Op.getValueType();
    4337        1040 :   SDValue Vec = Op.getOperand(0);
    4338        1040 :   SDValue Idx = Op.getOperand(1);
    4339             : 
    4340             :   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    4341             : 
    4342             :   // Make sure we do any optimizations that will make it easier to fold
    4343             :   // source modifiers before obscuring it with bit operations.
    4344             : 
    4345             :   // XXX - Why doesn't this get called when vector_shuffle is expanded?
    4346        1040 :   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    4347           7 :     return Combined;
    4348             : 
    4349             :   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
    4350        1022 :     SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    4351             : 
    4352        1022 :     if (CIdx->getZExtValue() == 1) {
    4353         538 :       Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
    4354        1076 :                            DAG.getConstant(16, SL, MVT::i32));
    4355             :     } else {
    4356             :       assert(CIdx->getZExtValue() == 0);
    4357             :     }
    4358             : 
    4359        1022 :     if (ResultVT.bitsLT(MVT::i32))
    4360         951 :       Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
    4361        1022 :     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    4362             :   }
    4363             : 
    4364          11 :   SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
    4365             : 
    4366             :   // Convert vector index to bit-index.
    4367          11 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
    4368             : 
    4369          11 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    4370          11 :   SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
    4371             : 
    4372          11 :   SDValue Result = Elt;
    4373          11 :   if (ResultVT.bitsLT(MVT::i32))
    4374           7 :     Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
    4375             : 
    4376          11 :   return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    4377             : }
    4378             : 
    4379             : bool
    4380        1679 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    4381             :   // We can fold offsets for anything that doesn't require a GOT relocation.
    4382        3330 :   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    4383        3251 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    4384        3358 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    4385        1758 :          !shouldEmitGOTReloc(GA->getGlobal());
    4386             : }
    4387             : 
    4388             : static SDValue
    4389         501 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
    4390             :                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
    4391             :                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
    4392             :   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
    4393             :   // lowered to the following code sequence:
    4394             :   //
    4395             :   // For constant address space:
    4396             :   //   s_getpc_b64 s[0:1]
    4397             :   //   s_add_u32 s0, s0, $symbol
    4398             :   //   s_addc_u32 s1, s1, 0
    4399             :   //
    4400             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4401             :   //   a fixup or relocation is emitted to replace $symbol with a literal
    4402             :   //   constant, which is a pc-relative offset from the encoding of the $symbol
    4403             :   //   operand to the global variable.
    4404             :   //
    4405             :   // For global address space:
    4406             :   //   s_getpc_b64 s[0:1]
    4407             :   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
    4408             :   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
    4409             :   //
    4410             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4411             :   //   fixups or relocations are emitted to replace $symbol@*@lo and
    4412             :   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
    4413             :   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
    4414             :   //   operand to the global variable.
    4415             :   //
    4416             :   // What we want here is an offset from the value returned by s_getpc
    4417             :   // (which is the address of the s_add_u32 instruction) to the global
    4418             :   // variable, but since the encoding of $symbol starts 4 bytes after the start
    4419             :   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
    4420             :   // small. This requires us to add 4 to the global variable offset in order to
    4421             :   // compute the correct address.
    4422         501 :   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4423        1503 :                                              GAFlags);
    4424             :   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4425             :                                              GAFlags == SIInstrInfo::MO_NONE ?
    4426        1002 :                                              GAFlags : GAFlags + 1);
    4427         501 :   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
    4428             : }
    4429             : 
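                     : // Globals in the global or constant address spaces (and function addresses)
                     : // are addressed PC-relative, either directly (fixup or REL32) or through a
                     : // GOT load; anything else goes to the generic AMDGPU lowering.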
    4430         860 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    4431             :                                              SDValue Op,
    4432             :                                              SelectionDAG &DAG) const {
    4433             :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
    4434             :   const GlobalValue *GV = GSD->getGlobal();
    4435             : 
    4436        1698 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
    4437        1676 :       GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
    4438        2510 :       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
    4439             :       // FIXME: It isn't correct to rely on the type of the pointer. This should
    4440             :       // be removed when address space 0 is 64-bit.
    4441             :       !GV->getType()->getElementType()->isFunctionTy())
    4442         359 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
    4443             : 
    4444             :   SDLoc DL(GSD);
    4445         501 :   EVT PtrVT = Op.getValueType();
    4446             : 
    4447         501 :   if (shouldEmitFixup(GV))
    4448          19 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
    4449         482 :   else if (shouldEmitPCReloc(GV))
    4450             :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
    4451         457 :                                    SIInstrInfo::MO_REL32);
    4452             : 
    4453             :   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
    4454          25 :                                             SIInstrInfo::MO_GOTPCREL32);
    4455             : 
    4456          25 :   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
    4457          25 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    4458             :   const DataLayout &DataLayout = DAG.getDataLayout();
    4459          25 :   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
    4460             :   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
    4461          25 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    4462             : 
    4463             :   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
    4464             :                      MachineMemOperand::MODereferenceable |
    4465          25 :                          MachineMemOperand::MOInvariant);
    4466             : }
    4467             : 
    4468        7619 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
    4469             :                                    const SDLoc &DL, SDValue V) const {
    4470             :   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
    4471             :   // the destination register.
    4472             :   //
    4473             :   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
    4474             :   // so we will end up with redundant moves to m0.
    4475             :   //
    4476             :   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
    4477             : 
    4478             :   // A Null SDValue creates a glue result.
    4479        7619 :   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
    4480        7619 :                                   V, Chain);
    4481        7619 :   return SDValue(M0, 0);
    4482             : }
    4483             : 
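                     : // Load a 32-bit kernel argument that is known to have its upper bits cleared
                     : // (e.g. the 16-bit local size values) and tag it with AssertZext so the
                     : // zero high bits can be assumed by later combines.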
    4484          91 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
    4485             :                                                  SDValue Op,
    4486             :                                                  MVT VT,
    4487             :                                                  unsigned Offset) const {
    4488             :   SDLoc SL(Op);
    4489             :   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
    4490         182 :                                            DAG.getEntryNode(), Offset, false);
    4491             :   // The local size values will have the high 16 bits as zero.
    4492             :   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
    4493         273 :                      DAG.getValueType(VT));
    4494             : }
    4495             : 
    4496           2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4497             :                                         EVT VT) {
    4498             :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4499             :                                       "non-hsa intrinsic with hsa target",
    4500           4 :                                       DL.getDebugLoc());
    4501           2 :   DAG.getContext()->diagnose(BadIntrin);
    4502           2 :   return DAG.getUNDEF(VT);
    4503             : }
    4504             : 
    4505           5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4506             :                                          EVT VT) {
    4507             :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4508             :                                       "intrinsic not supported on subtarget",
    4509          10 :                                       DL.getDebugLoc());
    4510           5 :   DAG.getContext()->diagnose(BadIntrin);
    4511           5 :   return DAG.getUNDEF(VT);
    4512             : }
    4513             : 
    4514        6092 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    4515             :                                                   SelectionDAG &DAG) const {
    4516             :   MachineFunction &MF = DAG.getMachineFunction();
    4517        6092 :   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
    4518             : 
    4519        6092 :   EVT VT = Op.getValueType();
    4520             :   SDLoc DL(Op);
    4521        6092 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4522             : 
    4523             :   // TODO: Should this propagate fast-math-flags?
    4524             : 
    4525        6092 :   switch (IntrinsicID) {
    4526           4 :   case Intrinsic::amdgcn_implicit_buffer_ptr: {
    4527           4 :     if (getSubtarget()->isAmdCodeObjectV2(MF))
    4528           2 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4529             :     return getPreloadedValue(DAG, *MFI, VT,
    4530           2 :                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
    4531             :   }
    4532          56 :   case Intrinsic::amdgcn_dispatch_ptr:
    4533             :   case Intrinsic::amdgcn_queue_ptr: {
    4534          56 :     if (!Subtarget->isAmdCodeObjectV2(MF)) {
    4535             :       DiagnosticInfoUnsupported BadIntrin(
    4536             :           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    4537           4 :           DL.getDebugLoc());
    4538           2 :       DAG.getContext()->diagnose(BadIntrin);
    4539           2 :       return DAG.getUNDEF(VT);
    4540             :     }
    4541             : 
    4542          54 :     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
    4543             :       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    4544          54 :     return getPreloadedValue(DAG, *MFI, VT, RegID);
    4545             :   }
    4546          31 :   case Intrinsic::amdgcn_implicitarg_ptr: {
    4547          31 :     if (MFI->isEntryFunction())
    4548          25 :       return getImplicitArgPtr(DAG, DL);
    4549             :     return getPreloadedValue(DAG, *MFI, VT,
    4550           6 :                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    4551             :   }
    4552          27 :   case Intrinsic::amdgcn_kernarg_segment_ptr: {
    4553             :     return getPreloadedValue(DAG, *MFI, VT,
    4554          27 :                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    4555             :   }
    4556           9 :   case Intrinsic::amdgcn_dispatch_id: {
    4557           9 :     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
    4558             :   }
    4559             :   case Intrinsic::amdgcn_rcp:
    4560          20 :     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
    4561             :   case Intrinsic::amdgcn_rsq:
    4562          32 :     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4563           5 :   case Intrinsic::amdgcn_rsq_legacy:
    4564           5 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4565           1 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4566             : 
    4567           4 :     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    4568          11 :   case Intrinsic::amdgcn_rcp_legacy:
    4569          11 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4570           4 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4571           7 :     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
    4572           6 :   case Intrinsic::amdgcn_rsq_clamp: {
    4573           6 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4574           3 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    4575             : 
    4576           3 :     Type *Type = VT.getTypeForEVT(*DAG.getContext());
    4577           3 :     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    4578           3 :     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
    4579             : 
    4580           3 :     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4581             :     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
    4582           3 :                               DAG.getConstantFP(Max, DL, VT));
    4583             :     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
    4584           3 :                        DAG.getConstantFP(Min, DL, VT));
    4585             :   }
    4586           2 :   case Intrinsic::r600_read_ngroups_x:
    4587           2 :     if (Subtarget->isAmdHsaOS())
    4588           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4589             : 
    4590             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4591           2 :                                     SI::KernelInputOffsets::NGROUPS_X, false);
    4592           2 :   case Intrinsic::r600_read_ngroups_y:
    4593           2 :     if (Subtarget->isAmdHsaOS())
    4594           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4595             : 
    4596             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4597           2 :                                     SI::KernelInputOffsets::NGROUPS_Y, false);
    4598           2 :   case Intrinsic::r600_read_ngroups_z:
    4599           2 :     if (Subtarget->isAmdHsaOS())
    4600           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4601             : 
    4602             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4603           2 :                                     SI::KernelInputOffsets::NGROUPS_Z, false);
    4604           2 :   case Intrinsic::r600_read_global_size_x:
    4605           2 :     if (Subtarget->isAmdHsaOS())
    4606           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4607             : 
    4608             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4609           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
    4610           2 :   case Intrinsic::r600_read_global_size_y:
    4611           2 :     if (Subtarget->isAmdHsaOS())
    4612           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4613             : 
    4614             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4615           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
    4616           2 :   case Intrinsic::r600_read_global_size_z:
    4617           2 :     if (Subtarget->isAmdHsaOS())
    4618           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4619             : 
    4620             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4621           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
    4622          13 :   case Intrinsic::r600_read_local_size_x:
    4623          13 :     if (Subtarget->isAmdHsaOS())
    4624           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4625             : 
    4626             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4627          13 :                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
    4628          39 :   case Intrinsic::r600_read_local_size_y:
    4629          39 :     if (Subtarget->isAmdHsaOS())
    4630           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4631             : 
    4632             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4633          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
    4634          39 :   case Intrinsic::r600_read_local_size_z:
    4635          39 :     if (Subtarget->isAmdHsaOS())
    4636           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4637             : 
    4638             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4639          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
    4640          49 :   case Intrinsic::amdgcn_workgroup_id_x:
    4641             :   case Intrinsic::r600_read_tgid_x:
    4642             :     return getPreloadedValue(DAG, *MFI, VT,
    4643          49 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    4644          24 :   case Intrinsic::amdgcn_workgroup_id_y:
    4645             :   case Intrinsic::r600_read_tgid_y:
    4646             :     return getPreloadedValue(DAG, *MFI, VT,
    4647          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    4648          24 :   case Intrinsic::amdgcn_workgroup_id_z:
    4649             :   case Intrinsic::r600_read_tgid_z:
    4650             :     return getPreloadedValue(DAG, *MFI, VT,
    4651          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    4652        2893 :   case Intrinsic::amdgcn_workitem_id_x: {
    4653             :   case Intrinsic::r600_read_tidig_x:
    4654             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4655        2893 :                           SDLoc(DAG.getEntryNode()),
    4656        5786 :                           MFI->getArgInfo().WorkItemIDX);
    4657             :   }
    4658         116 :   case Intrinsic::amdgcn_workitem_id_y:
    4659             :   case Intrinsic::r600_read_tidig_y:
    4660             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4661         116 :                           SDLoc(DAG.getEntryNode()),
    4662         232 :                           MFI->getArgInfo().WorkItemIDY);
    4663          74 :   case Intrinsic::amdgcn_workitem_id_z:
    4664             :   case Intrinsic::r600_read_tidig_z:
    4665             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4666          74 :                           SDLoc(DAG.getEntryNode()),
    4667         148 :                           MFI->getArgInfo().WorkItemIDZ);
    4668         566 :   case AMDGPUIntrinsic::SI_load_const: {
    4669             :     SDValue Ops[] = {
    4670             :       Op.getOperand(1),
    4671             :       Op.getOperand(2)
    4672         566 :     };
    4673             : 
    4674        1132 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4675             :         MachinePointerInfo(),
    4676             :         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    4677             :             MachineMemOperand::MOInvariant,
    4678         566 :         VT.getStoreSize(), 4);
    4679             :     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
    4680         566 :                                    Op->getVTList(), Ops, VT, MMO);
    4681             :   }
    4682           4 :   case Intrinsic::amdgcn_fdiv_fast:
    4683           4 :     return lowerFDIV_FAST(Op, DAG);
    4684          83 :   case Intrinsic::amdgcn_interp_mov: {
    4685          83 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4686          83 :     SDValue Glue = M0.getValue(1);
    4687             :     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
    4688          83 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4689             :   }
    4690         204 :   case Intrinsic::amdgcn_interp_p1: {
    4691         204 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4692         204 :     SDValue Glue = M0.getValue(1);
    4693             :     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
    4694         204 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4695             :   }
    4696         188 :   case Intrinsic::amdgcn_interp_p2: {
    4697         188 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    4698             :     SDValue Glue = SDValue(M0.getNode(), 1);
    4699             :     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
    4700             :                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
    4701         188 :                        Glue);
    4702             :   }
    4703             :   case Intrinsic::amdgcn_sin:
    4704           5 :     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
    4705             : 
    4706             :   case Intrinsic::amdgcn_cos:
    4707           3 :     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
    4708             : 
    4709           3 :   case Intrinsic::amdgcn_log_clamp: {
    4710           3 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4711           2 :       return SDValue();
    4712             : 
    4713             :     DiagnosticInfoUnsupported BadIntrin(
    4714             :       MF.getFunction(), "intrinsic not supported on subtarget",
    4715           2 :       DL.getDebugLoc());
    4716           1 :     DAG.getContext()->diagnose(BadIntrin);
    4717           1 :     return DAG.getUNDEF(VT);
    4718             :   }
    4719             :   case Intrinsic::amdgcn_ldexp:
    4720             :     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
    4721           9 :                        Op.getOperand(1), Op.getOperand(2));
    4722             : 
    4723             :   case Intrinsic::amdgcn_fract:
    4724           7 :     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    4725             : 
    4726             :   case Intrinsic::amdgcn_class:
    4727             :     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
    4728          59 :                        Op.getOperand(1), Op.getOperand(2));
    4729          10 :   case Intrinsic::amdgcn_div_fmas:
    4730             :     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
    4731             :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    4732          10 :                        Op.getOperand(4));
    4733             : 
    4734          13 :   case Intrinsic::amdgcn_div_fixup:
    4735             :     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
    4736          13 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4737             : 
    4738             :   case Intrinsic::amdgcn_trig_preop:
    4739             :     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
    4740           4 :                        Op.getOperand(1), Op.getOperand(2));
    4741          27 :   case Intrinsic::amdgcn_div_scale: {
    4742             :     // 3rd parameter required to be a constant.
    4743             :     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4744             :     if (!Param)
    4745           9 :       return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
    4746             : 
    4747             :     // Translate to the operands expected by the machine instruction. The
    4748             :     // first parameter must be the same as the first instruction.
    4749          24 :     SDValue Numerator = Op.getOperand(1);
    4750          24 :     SDValue Denominator = Op.getOperand(2);
    4751             : 
    4752             :     // Note this order is opposite of the machine instruction's operations,
    4753             :     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    4754             :     // intrinsic has the numerator as the first operand to match a normal
    4755             :     // division operation.
    4756             : 
    4757          24 :     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
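                     :     // For example, with Param == -1 (all ones) this builds
                     :     //   DIV_SCALE(Numerator, Denominator, Numerator),
                     :     // and with Param == 0 it builds
                     :     //   DIV_SCALE(Denominator, Denominator, Numerator).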
    4758             : 
    4759             :     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
    4760          24 :                        Denominator, Numerator);
    4761             :   }
    4762          48 :   case Intrinsic::amdgcn_icmp: {
    4763             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4764             :     if (!CD)
    4765           6 :       return DAG.getUNDEF(VT);
    4766             : 
    4767             :     int CondCode = CD->getSExtValue();
    4768          42 :     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
    4769             :         CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    4770           2 :       return DAG.getUNDEF(VT);
    4771             : 
    4772             :     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
    4773          40 :     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
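                     :     // For example, a CondCode of 32 (ICmpInst::ICMP_EQ) is translated to
                     :     // ISD::SETEQ here; out-of-range condition codes were already folded to
                     :     // undef above.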
    4774             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4775          80 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4776             :   }
    4777          56 :   case Intrinsic::amdgcn_fcmp: {
    4778             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4779             :     if (!CD)
    4780           2 :       return DAG.getUNDEF(VT);
    4781             : 
    4782             :     int CondCode = CD->getSExtValue();
    4783          54 :     if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
    4784             :         CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
    4785           2 :       return DAG.getUNDEF(VT);
    4786             : 
    4787             :     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
    4788          52 :     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
    4789             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4790         104 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4791             :   }
    4792          69 :   case Intrinsic::amdgcn_fmed3:
    4793             :     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
    4794          69 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4795             :   case Intrinsic::amdgcn_fmul_legacy:
    4796             :     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
    4797          31 :                        Op.getOperand(1), Op.getOperand(2));
    4798             :   case Intrinsic::amdgcn_sffbh:
    4799           4 :     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
    4800         102 :   case Intrinsic::amdgcn_sbfe:
    4801             :     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
    4802         102 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4803          94 :   case Intrinsic::amdgcn_ubfe:
    4804             :     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
    4805          94 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4806          38 :   case Intrinsic::amdgcn_cvt_pkrtz:
    4807             :   case Intrinsic::amdgcn_cvt_pknorm_i16:
    4808             :   case Intrinsic::amdgcn_cvt_pknorm_u16:
    4809             :   case Intrinsic::amdgcn_cvt_pk_i16:
    4810             :   case Intrinsic::amdgcn_cvt_pk_u16: {
    4811             :     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    4812          38 :     EVT VT = Op.getValueType();
    4813             :     unsigned Opcode;
    4814             : 
    4815          38 :     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
    4816             :       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    4817          28 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
    4818             :       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    4819          19 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
    4820             :       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    4821          10 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
    4822             :       Opcode = AMDGPUISD::CVT_PK_I16_I32;
    4823             :     else
    4824             :       Opcode = AMDGPUISD::CVT_PK_U16_U32;
    4825             : 
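                     :     // All of these intrinsics return a packed v2f16/v2i16 value, but the
                     :     // node is built with an i32 result and bitcast back to the requested
                     :     // vector type (see the FIXME above).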
    4826             :     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
    4827          38 :                                Op.getOperand(1), Op.getOperand(2));
    4828          38 :     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
    4829             :   }
    4830           8 :   case Intrinsic::amdgcn_wqm: {
    4831           8 :     SDValue Src = Op.getOperand(1);
    4832           8 :     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
    4833           8 :                    0);
    4834             :   }
    4835          14 :   case Intrinsic::amdgcn_wwm: {
    4836          14 :     SDValue Src = Op.getOperand(1);
    4837          14 :     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
    4838          14 :                    0);
    4839             :   }
    4840          26 :   case Intrinsic::amdgcn_image_getlod:
    4841             :   case Intrinsic::amdgcn_image_getresinfo: {
    4842          26 :     unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
    4843             : 
    4844             :     // If the dmask has every channel disabled, fold the result to undef.
    4845             :     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
    4846          26 :     if (!DMask || DMask->isNullValue())
    4847           4 :       return DAG.getUNDEF(Op.getValueType());
    4848          22 :     return SDValue();
    4849             :   }
    4850         933 :   default:
    4851         933 :     return Op;
    4852             :   }
    4853             : }
    4854             : 
    4855        1370 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
    4856             :                                                  SelectionDAG &DAG) const {
    4857        1370 :   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    4858             :   SDLoc DL(Op);
    4859             : 
    4860        1370 :   switch (IntrID) {
    4861             :   case Intrinsic::amdgcn_atomic_inc:
    4862             :   case Intrinsic::amdgcn_atomic_dec:
    4863             :   case Intrinsic::amdgcn_ds_fadd:
    4864             :   case Intrinsic::amdgcn_ds_fmin:
    4865             :   case Intrinsic::amdgcn_ds_fmax: {
    4866             :     MemSDNode *M = cast<MemSDNode>(Op);
    4867             :     unsigned Opc;
    4868         245 :     switch (IntrID) {
    4869             :     case Intrinsic::amdgcn_atomic_inc:
    4870             :       Opc = AMDGPUISD::ATOMIC_INC;
    4871             :       break;
    4872         115 :     case Intrinsic::amdgcn_atomic_dec:
    4873             :       Opc = AMDGPUISD::ATOMIC_DEC;
    4874         115 :       break;
    4875           6 :     case Intrinsic::amdgcn_ds_fadd:
    4876             :       Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
    4877           6 :       break;
    4878           6 :     case Intrinsic::amdgcn_ds_fmin:
    4879             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
    4880           6 :       break;
    4881           6 :     case Intrinsic::amdgcn_ds_fmax:
    4882             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
    4883           6 :       break;
    4884           0 :     default:
    4885           0 :       llvm_unreachable("Unknown intrinsic!");
    4886             :     }
    4887             :     SDValue Ops[] = {
    4888             :       M->getOperand(0), // Chain
    4889             :       M->getOperand(2), // Ptr
    4890             :       M->getOperand(3)  // Value
    4891         245 :     };
    4892             : 
    4893         245 :     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
    4894         490 :                                    M->getMemoryVT(), M->getMemOperand());
    4895             :   }
    4896         173 :   case Intrinsic::amdgcn_buffer_load:
    4897             :   case Intrinsic::amdgcn_buffer_load_format: {
    4898             :     SDValue Ops[] = {
    4899             :       Op.getOperand(0), // Chain
    4900             :       Op.getOperand(2), // rsrc
    4901             :       Op.getOperand(3), // vindex
    4902             :       Op.getOperand(4), // offset
    4903             :       Op.getOperand(5), // glc
    4904             :       Op.getOperand(6)  // slc
    4905         173 :     };
    4906             : 
    4907         173 :     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
    4908             :         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
    4909         173 :     EVT VT = Op.getValueType();
    4910         173 :     EVT IntVT = VT.changeTypeToInteger();
    4911             : 
    4912             :     auto *M = cast<MemSDNode>(Op);
    4913             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
    4914         173 :                                    M->getMemOperand());
    4915             :   }
    4916             :   case Intrinsic::amdgcn_tbuffer_load: {
    4917             :     MemSDNode *M = cast<MemSDNode>(Op);
    4918             :     SDValue Ops[] = {
    4919             :       Op.getOperand(0),  // Chain
    4920             :       Op.getOperand(2),  // rsrc
    4921             :       Op.getOperand(3),  // vindex
    4922             :       Op.getOperand(4),  // voffset
    4923             :       Op.getOperand(5),  // soffset
    4924             :       Op.getOperand(6),  // offset
    4925             :       Op.getOperand(7),  // dfmt
    4926             :       Op.getOperand(8),  // nfmt
    4927             :       Op.getOperand(9),  // glc
    4928             :       Op.getOperand(10)   // slc
    4929          32 :     };
    4930             : 
    4931          32 :     EVT VT = Op.getValueType();
    4932             : 
    4933             :     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
    4934          32 :                                    Op->getVTList(), Ops, VT, M->getMemOperand());
    4935             :   }
    4936          34 :   case Intrinsic::amdgcn_buffer_atomic_swap:
    4937             :   case Intrinsic::amdgcn_buffer_atomic_add:
    4938             :   case Intrinsic::amdgcn_buffer_atomic_sub:
    4939             :   case Intrinsic::amdgcn_buffer_atomic_smin:
    4940             :   case Intrinsic::amdgcn_buffer_atomic_umin:
    4941             :   case Intrinsic::amdgcn_buffer_atomic_smax:
    4942             :   case Intrinsic::amdgcn_buffer_atomic_umax:
    4943             :   case Intrinsic::amdgcn_buffer_atomic_and:
    4944             :   case Intrinsic::amdgcn_buffer_atomic_or:
    4945             :   case Intrinsic::amdgcn_buffer_atomic_xor: {
    4946             :     SDValue Ops[] = {
    4947             :       Op.getOperand(0), // Chain
    4948             :       Op.getOperand(2), // vdata
    4949             :       Op.getOperand(3), // rsrc
    4950             :       Op.getOperand(4), // vindex
    4951             :       Op.getOperand(5), // offset
    4952             :       Op.getOperand(6)  // slc
    4953          34 :     };
    4954          34 :     EVT VT = Op.getValueType();
    4955             : 
    4956             :     auto *M = cast<MemSDNode>(Op);
    4957             :     unsigned Opcode = 0;
    4958             : 
    4959          34 :     switch (IntrID) {
    4960             :     case Intrinsic::amdgcn_buffer_atomic_swap:
    4961             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
    4962             :       break;
    4963           4 :     case Intrinsic::amdgcn_buffer_atomic_add:
    4964             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
    4965           4 :       break;
    4966           2 :     case Intrinsic::amdgcn_buffer_atomic_sub:
    4967             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
    4968           2 :       break;
    4969           2 :     case Intrinsic::amdgcn_buffer_atomic_smin:
    4970             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
    4971           2 :       break;
    4972           2 :     case Intrinsic::amdgcn_buffer_atomic_umin:
    4973             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
    4974           2 :       break;
    4975           2 :     case Intrinsic::amdgcn_buffer_atomic_smax:
    4976             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
    4977           2 :       break;
    4978           2 :     case Intrinsic::amdgcn_buffer_atomic_umax:
    4979             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
    4980           2 :       break;
    4981           2 :     case Intrinsic::amdgcn_buffer_atomic_and:
    4982             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
    4983           2 :       break;
    4984           2 :     case Intrinsic::amdgcn_buffer_atomic_or:
    4985             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
    4986           2 :       break;
    4987           2 :     case Intrinsic::amdgcn_buffer_atomic_xor:
    4988             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
    4989           2 :       break;
    4990           0 :     default:
    4991           0 :       llvm_unreachable("unhandled atomic opcode");
    4992             :     }
    4993             : 
    4994             :     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
    4995          34 :                                    M->getMemOperand());
    4996             :   }
    4997             : 
    4998          12 :   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    4999             :     SDValue Ops[] = {
    5000             :       Op.getOperand(0), // Chain
    5001             :       Op.getOperand(2), // src
    5002             :       Op.getOperand(3), // cmp
    5003             :       Op.getOperand(4), // rsrc
    5004             :       Op.getOperand(5), // vindex
    5005             :       Op.getOperand(6), // offset
    5006             :       Op.getOperand(7)  // slc
    5007          12 :     };
    5008          12 :     EVT VT = Op.getValueType();
    5009             :     auto *M = cast<MemSDNode>(Op);
    5010             : 
    5011             :     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
    5012          12 :                                    Op->getVTList(), Ops, VT, M->getMemOperand());
    5013             :   }
    5014             : 
    5015             :   // Basic sample.
    5016         542 :   case Intrinsic::amdgcn_image_sample:
    5017             :   case Intrinsic::amdgcn_image_sample_cl:
    5018             :   case Intrinsic::amdgcn_image_sample_d:
    5019             :   case Intrinsic::amdgcn_image_sample_d_cl:
    5020             :   case Intrinsic::amdgcn_image_sample_l:
    5021             :   case Intrinsic::amdgcn_image_sample_b:
    5022             :   case Intrinsic::amdgcn_image_sample_b_cl:
    5023             :   case Intrinsic::amdgcn_image_sample_lz:
    5024             :   case Intrinsic::amdgcn_image_sample_cd:
    5025             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    5026             : 
    5027             :   // Sample with comparison.
    5028             :   case Intrinsic::amdgcn_image_sample_c:
    5029             :   case Intrinsic::amdgcn_image_sample_c_cl:
    5030             :   case Intrinsic::amdgcn_image_sample_c_d:
    5031             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    5032             :   case Intrinsic::amdgcn_image_sample_c_l:
    5033             :   case Intrinsic::amdgcn_image_sample_c_b:
    5034             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    5035             :   case Intrinsic::amdgcn_image_sample_c_lz:
    5036             :   case Intrinsic::amdgcn_image_sample_c_cd:
    5037             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    5038             : 
    5039             :   // Sample with offsets.
    5040             :   case Intrinsic::amdgcn_image_sample_o:
    5041             :   case Intrinsic::amdgcn_image_sample_cl_o:
    5042             :   case Intrinsic::amdgcn_image_sample_d_o:
    5043             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    5044             :   case Intrinsic::amdgcn_image_sample_l_o:
    5045             :   case Intrinsic::amdgcn_image_sample_b_o:
    5046             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    5047             :   case Intrinsic::amdgcn_image_sample_lz_o:
    5048             :   case Intrinsic::amdgcn_image_sample_cd_o:
    5049             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    5050             : 
    5051             :   // Sample with comparison and offsets.
    5052             :   case Intrinsic::amdgcn_image_sample_c_o:
    5053             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    5054             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    5055             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    5056             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    5057             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    5058             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    5059             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    5060             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    5061             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
    5062             :     // If the dmask has every channel disabled, fold the result to undef.
    5063             :     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
    5064         540 :     if (!DMask || DMask->isNullValue()) {
    5065          62 :       SDValue Undef = DAG.getUNDEF(Op.getValueType());
    5066         186 :       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
    5067             :     }
    5068             : 
    5069         480 :     return SDValue();
    5070             :   }
    5071         332 :   default:
    5072         332 :     return SDValue();
    5073             :   }
    5074             : }
    5075             : 
    5076          30 : SDValue SITargetLowering::handleD16VData(SDValue VData,
    5077             :                                          SelectionDAG &DAG) const {
    5078          30 :   EVT StoreVT = VData.getValueType();
    5079             :   SDLoc DL(VData);
    5080             : 
    5081          30 :   if (StoreVT.isVector()) {
    5082             :     assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
    5083          21 :     if (!Subtarget->hasUnpackedD16VMem()) {
    5084             :       if (!isTypeLegal(StoreVT)) {
    5085             :         // If the target supports packed vmem, we just need to work around
    5086             :         // the illegal type by casting to an equivalent one.
    5087          11 :         EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
    5088          11 :         return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
    5089             :       }
    5090             :     } else { // We need to unpack the packed data to store.
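                     :       // For example, a v2f16 value is bitcast to v2i16 and zero-extended to
                     :       // v2i32, so each half ends up in the low 16 bits of its own 32-bit
                     :       // lane, which is (presumably) the layout the unpacked D16 memory
                     :       // instructions expect.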
    5091           7 :       EVT IntStoreVT = StoreVT.changeTypeToInteger();
    5092           7 :       SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
    5093             :       EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
    5094           7 :       return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    5095             :     }
    5096             :   }
    5097             :   // No change for f16 and legal vector D16 types.
    5098          12 :   return VData;
    5099             : }
    5100             : 
    5101        2099 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    5102             :                                               SelectionDAG &DAG) const {
    5103             :   SDLoc DL(Op);
    5104        2099 :   SDValue Chain = Op.getOperand(0);
    5105        2099 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    5106             :   MachineFunction &MF = DAG.getMachineFunction();
    5107             : 
    5108        2099 :   switch (IntrinsicID) {
    5109         354 :   case Intrinsic::amdgcn_exp: {
    5110             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5111             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5112             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    5113             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
    5114             : 
    5115             :     const SDValue Ops[] = {
    5116             :       Chain,
    5117             :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5118             :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5119             :       Op.getOperand(4), // src0
    5120             :       Op.getOperand(5), // src1
    5121             :       Op.getOperand(6), // src2
    5122             :       Op.getOperand(7), // src3
    5123             :       DAG.getTargetConstant(0, DL, MVT::i1), // compr
    5124             :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5125        1770 :     };
    5126             : 
    5127         354 :     unsigned Opc = Done->isNullValue() ?
    5128             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5129         354 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5130             :   }
    5131          93 :   case Intrinsic::amdgcn_exp_compr: {
    5132             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5133             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5134          93 :     SDValue Src0 = Op.getOperand(4);
    5135          93 :     SDValue Src1 = Op.getOperand(5);
    5136             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    5137             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
    5138             : 
    5139          93 :     SDValue Undef = DAG.getUNDEF(MVT::f32);
    5140             :     const SDValue Ops[] = {
    5141             :       Chain,
    5142             :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5143             :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5144          93 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
    5145          93 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    5146             :       Undef, // src2
    5147             :       Undef, // src3
    5148             :       DAG.getTargetConstant(1, DL, MVT::i1), // compr
    5149             :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5150         744 :     };
    5151             : 
    5152          93 :     unsigned Opc = Done->isNullValue() ?
    5153             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5154          93 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5155             :   }
    5156          24 :   case Intrinsic::amdgcn_s_sendmsg:
    5157             :   case Intrinsic::amdgcn_s_sendmsghalt: {
    5158          24 :     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
    5159             :       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
    5160          24 :     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    5161          24 :     SDValue Glue = Chain.getValue(1);
    5162             :     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
    5163          24 :                        Op.getOperand(2), Glue);
    5164             :   }
    5165             :   case Intrinsic::amdgcn_init_exec: {
    5166             :     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
    5167           2 :                        Op.getOperand(2));
    5168             :   }
    5169           4 :   case Intrinsic::amdgcn_init_exec_from_input: {
    5170             :     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
    5171           4 :                        Op.getOperand(2), Op.getOperand(3));
    5172             :   }
    5173          31 :   case AMDGPUIntrinsic::AMDGPU_kill: {
    5174          31 :     SDValue Src = Op.getOperand(2);
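                     :     // The kill applies to lanes whose operand is negative, so a known
                     :     // non-negative constant kills nothing and a known negative constant
                     :     // kills unconditionally (lowered with -1.0 below).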
    5175             :     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
    5176          11 :       if (!K->isNegative())
    5177           4 :         return Chain;
    5178             : 
    5179           7 :       SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
    5180           7 :       return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    5181             :     }
    5182             : 
    5183          20 :     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    5184          20 :     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
    5185             :   }
    5186         149 :   case Intrinsic::amdgcn_s_barrier: {
    5187         149 :     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
    5188             :       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    5189         141 :       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
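                     :       // If the whole workgroup fits in a single wave the barrier can never
                     :       // be observed, so it is replaced by WAVE_BARRIER, a pseudo that only
                     :       // constrains scheduling and emits no instruction.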
    5190         141 :       if (WGSize <= ST.getWavefrontSize())
    5191          10 :         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
    5192           5 :                                           Op.getOperand(0)), 0);
    5193             :     }
    5194         144 :     return SDValue();
    5195             :   }
    5196          14 :   case AMDGPUIntrinsic::SI_tbuffer_store: {
    5197             : 
    5198             :     // Extract vindex and voffset from vaddr as appropriate
    5199             :     const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
    5200             :     const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
    5201          14 :     SDValue VAddr = Op.getOperand(5);
    5202             : 
    5203          14 :     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
    5204             : 
    5205             :     assert(!(OffEn->isOne() && IdxEn->isOne()) &&
    5206             :            "Legacy intrinsic doesn't support both offset and index - use new version");
    5207             : 
    5208          14 :     SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
    5209          14 :     SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
    5210             : 
    5211             :     // Deal with the vec-3 case
    5212             :     const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
    5213          14 :     auto Opcode = NumChannels->getZExtValue() == 3 ?
    5214             :       AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
    5215             : 
    5216             :     SDValue Ops[] = {
    5217             :      Chain,
    5218             :      Op.getOperand(3),  // vdata
    5219             :      Op.getOperand(2),  // rsrc
    5220             :      VIndex,
    5221             :      VOffset,
    5222             :      Op.getOperand(6),  // soffset
    5223             :      Op.getOperand(7),  // inst_offset
    5224             :      Op.getOperand(8),  // dfmt
    5225             :      Op.getOperand(9),  // nfmt
    5226             :      Op.getOperand(12), // glc
    5227             :      Op.getOperand(13), // slc
    5228          14 :     };
    5229             : 
    5230             :     assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
    5231             :            "Value of tfe other than zero is unsupported");
    5232             : 
    5233          14 :     EVT VT = Op.getOperand(3).getValueType();
    5234          28 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    5235             :       MachinePointerInfo(),
    5236             :       MachineMemOperand::MOStore,
    5237          14 :       VT.getStoreSize(), 4);
    5238             :     return DAG.getMemIntrinsicNode(Opcode, DL,
    5239          14 :                                    Op->getVTList(), Ops, VT, MMO);
    5240             :   }
    5241             : 
    5242          41 :   case Intrinsic::amdgcn_tbuffer_store: {
    5243          41 :     SDValue VData = Op.getOperand(2);
    5244          82 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5245          41 :     if (IsD16)
    5246           9 :       VData = handleD16VData(VData, DAG);
    5247             :     SDValue Ops[] = {
    5248             :       Chain,
    5249             :       VData,             // vdata
    5250             :       Op.getOperand(3),  // rsrc
    5251             :       Op.getOperand(4),  // vindex
    5252             :       Op.getOperand(5),  // voffset
    5253             :       Op.getOperand(6),  // soffset
    5254             :       Op.getOperand(7),  // offset
    5255             :       Op.getOperand(8),  // dfmt
    5256             :       Op.getOperand(9),  // nfmt
    5257             :       Op.getOperand(10), // glc
    5258             :       Op.getOperand(11)  // slc
    5259          41 :     };
    5260          41 :     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
    5261             :                            AMDGPUISD::TBUFFER_STORE_FORMAT;
    5262             :     MemSDNode *M = cast<MemSDNode>(Op);
    5263             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5264          41 :                                    M->getMemoryVT(), M->getMemOperand());
    5265             :   }
    5266             : 
    5267         139 :   case Intrinsic::amdgcn_buffer_store:
    5268             :   case Intrinsic::amdgcn_buffer_store_format: {
    5269         139 :     SDValue VData = Op.getOperand(2);
    5270         278 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5271         139 :     if (IsD16)
    5272           9 :       VData = handleD16VData(VData, DAG);
    5273             :     SDValue Ops[] = {
    5274             :       Chain,
    5275             :       VData,            // vdata
    5276             :       Op.getOperand(3), // rsrc
    5277             :       Op.getOperand(4), // vindex
    5278             :       Op.getOperand(5), // offset
    5279             :       Op.getOperand(6), // glc
    5280             :       Op.getOperand(7)  // slc
    5281         139 :     };
    5282         139 :     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
    5283             :                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    5284         139 :     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    5285             :     MemSDNode *M = cast<MemSDNode>(Op);
    5286             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5287         139 :                                    M->getMemoryVT(), M->getMemOperand());
    5288             :   }
    5289             : 
    5290          40 :   case Intrinsic::amdgcn_image_store:
    5291             :   case Intrinsic::amdgcn_image_store_mip: {
    5292          40 :     SDValue VData = Op.getOperand(2);
    5293          68 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5294             :     if (IsD16)
    5295          12 :       VData = handleD16VData(VData, DAG);
    5296             :     SDValue Ops[] = {
    5297             :       Chain, // Chain
    5298             :       VData, // vdata
    5299             :       Op.getOperand(3), // vaddr
    5300             :       Op.getOperand(4), // rsrc
    5301             :       Op.getOperand(5), // dmask
    5302             :       Op.getOperand(6), // glc
    5303             :       Op.getOperand(7), // slc
    5304             :       Op.getOperand(8), // lwe
    5305             :       Op.getOperand(9)  // da
    5306          40 :     };
    5307          40 :     unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
    5308             :                   AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
    5309             :     MemSDNode *M = cast<MemSDNode>(Op);
    5310             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5311          40 :                                    M->getMemoryVT(), M->getMemOperand());
    5312             :   }
    5313             : 
    5314        1208 :   default:
    5315        1208 :     return Op;
    5316             :   }
    5317             : }
    5318             : 
    5319       80641 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    5320             :   SDLoc DL(Op);
    5321             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    5322             :   ISD::LoadExtType ExtType = Load->getExtensionType();
    5323       80641 :   EVT MemVT = Load->getMemoryVT();
    5324             : 
    5325       80641 :   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    5326             :     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
    5327        3196 :       return SDValue();
    5328             : 
    5329             :     // FIXME: Copied from PPC
    5330             :     // First, load into 32 bits, then truncate back to the memory type.
    5331             : 
    5332         275 :     SDValue Chain = Load->getChain();
    5333         275 :     SDValue BasePtr = Load->getBasePtr();
    5334             :     MachineMemOperand *MMO = Load->getMemOperand();
    5335             : 
    5336             :     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
    5337             : 
    5338             :     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
    5339         275 :                                    BasePtr, RealMemVT, MMO);
    5340             : 
    5341             :     SDValue Ops[] = {
    5342         275 :       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
    5343             :       NewLD.getValue(1)
    5344         550 :     };
    5345             : 
    5346         275 :     return DAG.getMergeValues(Ops, DL);
    5347             :   }
    5348             : 
    5349       77170 :   if (!MemVT.isVector())
    5350           0 :     return SDValue();
    5351             : 
    5352             :   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
    5353             :          "Custom lowering for non-i32 vectors hasn't been implemented.");
    5354             : 
    5355             :   unsigned AS = Load->getAddressSpace();
    5356      154340 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
    5357             :                           AS, Load->getAlignment())) {
    5358           2 :     SDValue Ops[2];
    5359           4 :     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    5360           2 :     return DAG.getMergeValues(Ops, DL);
    5361             :   }
    5362             : 
    5363             :   MachineFunction &MF = DAG.getMachineFunction();
    5364       77168 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    5365             :   // If there is a possibility that flat instructions access scratch memory,
    5366             :   // then we need to use the same legalization rules we use for private.
    5367       77168 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5368          24 :     AS = MFI->hasFlatScratchInit() ?
    5369             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5370             : 
    5371       77168 :   unsigned NumElements = MemVT.getVectorNumElements();
    5372      154336 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5373       77168 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
    5374       55564 :     if (isMemOpUniform(Load))
    5375       55337 :       return SDValue();
    5376             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5377             :     // have the same legalization requirements as global and private
    5378             :     // loads.
    5379             :     //
    5380             :   }
    5381       21831 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5382       21604 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5383             :       AS == AMDGPUASI.GLOBAL_ADDRESS) {
    5384       21901 :     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
    5385       14651 :         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
    5386         640 :       return SDValue();
    5387             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5388             :     // have the same legalization requirements as global and private
    5389             :     // loads.
    5390             :     //
    5391             :   }
    5392       21191 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5393       20964 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5394        8945 :       AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5395        8945 :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5396       12246 :     if (NumElements > 4)
    5397        1197 :       return SplitVectorLoad(Op, DAG);
    5398             :     // v4 loads are supported for private and global memory.
    5399       11049 :     return SDValue();
    5400             :   }
    5401        8945 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    5402             :     // Depending on the setting of the private_element_size field in the
    5403             :     // resource descriptor, we can only make private accesses up to a certain
    5404             :     // size.
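                     :     // For example, with a 4-byte element size every vector access is
                     :     // scalarized; with 8 bytes anything wider than v2 is split; with 16
                     :     // bytes only loads wider than v4 are split, matching the global/flat
                     :     // rules above.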
    5405         385 :     switch (Subtarget->getMaxPrivateElementSize()) {
    5406         212 :     case 4:
    5407         212 :       return scalarizeVectorLoad(Load, DAG);
    5408          53 :     case 8:
    5409          53 :       if (NumElements > 2)
    5410           5 :         return SplitVectorLoad(Op, DAG);
    5411          48 :       return SDValue();
    5412         120 :     case 16:
    5413             :       // Same as global/flat
    5414         120 :       if (NumElements > 4)
    5415           1 :         return SplitVectorLoad(Op, DAG);
    5416         119 :       return SDValue();
    5417           0 :     default:
    5418           0 :       llvm_unreachable("unsupported private_element_size");
    5419             :     }
    5420        8560 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    5421        8560 :     if (NumElements > 2)
    5422         873 :       return SplitVectorLoad(Op, DAG);
    5423             : 
    5424        7687 :     if (NumElements == 2)
    5425        7687 :       return SDValue();
    5426             : 
    5427             :     // If properly aligned, we might be able to use ds_read_b64 after splitting.
    5428           0 :     return SplitVectorLoad(Op, DAG);
    5429             :   }
    5430           0 :   return SDValue();
    5431             : }
    5432             : 
    5433         638 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    5434             :   if (Op.getValueType() != MVT::i64)
    5435           0 :     return SDValue();
    5436             : 
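                     :   // Split the 64-bit select into two 32-bit selects: bitcast the operands
                     :   // to v2i32, select the low and high halves separately with the same
                     :   // condition, and rebuild the i64 result, since the hardware select is
                     :   // only 32 bits wide.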
    5437             :   SDLoc DL(Op);
    5438         638 :   SDValue Cond = Op.getOperand(0);
    5439             : 
    5440         638 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    5441         638 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    5442             : 
    5443         638 :   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    5444         638 :   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
    5445             : 
    5446         638 :   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
    5447         638 :   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
    5448             : 
    5449         638 :   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
    5450             : 
    5451         638 :   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
    5452         638 :   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
    5453             : 
    5454         638 :   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
    5455             : 
    5456        1276 :   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
    5457         638 :   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
    5458             : }
    5459             : 
    5460             : // Catch division cases where we can use shortcuts with rcp and rsq
    5461             : // instructions.
    5462         205 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
    5463             :                                               SelectionDAG &DAG) const {
    5464             :   SDLoc SL(Op);
    5465         205 :   SDValue LHS = Op.getOperand(0);
    5466         205 :   SDValue RHS = Op.getOperand(1);
    5467         205 :   EVT VT = Op.getValueType();
    5468         205 :   const SDNodeFlags Flags = Op->getFlags();
    5469         193 :   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
    5470         369 :                 Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
    5471             : 
    5472         102 :   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    5473           9 :     return SDValue();
    5474             : 
    5475             :   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    5476          69 :     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    5477          69 :       if (CLHS->isExactlyValue(1.0)) {
    5478             :         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    5479             :         // the CI documentation has a worst case error of 1 ulp.
    5480             :         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    5481             :         // use it as long as we aren't trying to use denormals.
    5482             :         //
    5483             :         // v_rcp_f16 and v_rsq_f16 DO support denormals.
    5484             : 
    5485             :         // 1.0 / sqrt(x) -> rsq(x)
    5486             : 
    5487             :         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
    5488             :         // error seems really high at 2^29 ULP.
    5489          53 :         if (RHS.getOpcode() == ISD::FSQRT)
    5490           6 :           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
    5491             : 
    5492             :         // 1.0 / x -> rcp(x)
    5493          47 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5494             :       }
    5495             : 
    5496             :       // Same as for 1.0, but expand the sign out of the constant.
    5497          16 :       if (CLHS->isExactlyValue(-1.0)) {
    5498             :         // -1.0 / x -> rcp (fneg x)
    5499          16 :         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    5500          16 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    5501             :       }
    5502             :     }
    5503             :   }
    5504             : 
    5505         127 :   if (Unsafe) {
    5506             :     // Turn into multiply by the reciprocal.
    5507             :     // x / y -> x * (1.0 / y)
    5508          80 :     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5509          80 :     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
    5510             :   }
    5511             : 
    5512          47 :   return SDValue();
    5513             : }
    5514             : 
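                     : // Helpers for LowerFDIV32 below: when GlueChain carries a chain and glue
                     : // (three result values), the plain FP opcode is rewritten to its *_W_CHAIN
                     : // form so the operation stays ordered with the surrounding s_setreg nodes
                     : // that toggle the FP32 denormal mode.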
    5515          54 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5516             :                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
    5517          54 :   if (GlueChain->getNumValues() <= 1) {
    5518           9 :     return DAG.getNode(Opcode, SL, VT, A, B);
    5519             :   }
    5520             : 
    5521             :   assert(GlueChain->getNumValues() == 3);
    5522             : 
    5523          45 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5524          45 :   switch (Opcode) {
    5525           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5526          45 :   case ISD::FMUL:
    5527             :     Opcode = AMDGPUISD::FMUL_W_CHAIN;
    5528             :     break;
    5529             :   }
    5530             : 
    5531             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
    5532          45 :                      GlueChain.getValue(2));
    5533             : }
    5534             : 
    5535         270 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5536             :                            EVT VT, SDValue A, SDValue B, SDValue C,
    5537             :                            SDValue GlueChain) {
    5538         270 :   if (GlueChain->getNumValues() <= 1) {
    5539          45 :     return DAG.getNode(Opcode, SL, VT, A, B, C);
    5540             :   }
    5541             : 
    5542             :   assert(GlueChain->getNumValues() == 3);
    5543             : 
    5544         225 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5545         225 :   switch (Opcode) {
    5546           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5547         225 :   case ISD::FMA:
    5548             :     Opcode = AMDGPUISD::FMA_W_CHAIN;
    5549             :     break;
    5550             :   }
    5551             : 
    5552             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
    5553         225 :                      GlueChain.getValue(2));
    5554             : }
    5555             : 
    5556          24 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
    5557          24 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5558          22 :     return FastLowered;
    5559             : 
    5560             :   SDLoc SL(Op);
    5561           2 :   SDValue Src0 = Op.getOperand(0);
    5562           2 :   SDValue Src1 = Op.getOperand(1);
    5563             : 
    5564           2 :   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    5565           2 :   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
    5566             : 
    5567           2 :   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
    5568           2 :   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
    5569             : 
    5570           2 :   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
    5571           2 :   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
    5572             : 
    5573           2 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
    5574             : }
    5575             : 
    5576             : // Faster 2.5 ULP division that does not support denormals.
    5577           4 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
    5578             :   SDLoc SL(Op);
    5579           4 :   SDValue LHS = Op.getOperand(1);
    5580           4 :   SDValue RHS = Op.getOperand(2);
    5581             : 
    5582           4 :   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
    5583             : 
    5584           4 :   const APFloat K0Val(BitsToFloat(0x6f800000));
    5585           4 :   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
    5586             : 
    5587           4 :   const APFloat K1Val(BitsToFloat(0x2f800000));
    5588           4 :   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
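                     :   // K0 is 2^96 and K1 is 2^-32 as float bit patterns. If |RHS| exceeds 2^96
                     :   // the denominator is pre-scaled by 2^-32 before the rcp so it stays in
                     :   // range, and the quotient is multiplied by the same scale (r3) at the
                     :   // end, leaving the result unchanged.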
    5589             : 
    5590           4 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5591             : 
    5592             :   EVT SetCCVT =
    5593           4 :     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
    5594             : 
    5595           4 :   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
    5596             : 
    5597           4 :   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
    5598             : 
    5599             :   // TODO: Should this propagate fast-math-flags?
    5600           4 :   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
    5601             : 
    5602             :   // rcp does not support denormals.
    5603           4 :   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
    5604             : 
    5605           4 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
    5606             : 
    5607           8 :   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
    5608             : }
    5609             : 
    5610         174 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
    5611         174 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5612         120 :     return FastLowered;
    5613             : 
    5614             :   SDLoc SL(Op);
    5615          54 :   SDValue LHS = Op.getOperand(0);
    5616          54 :   SDValue RHS = Op.getOperand(1);
    5617             : 
    5618          54 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5619             : 
    5620          54 :   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
    5621             : 
    5622             :   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5623          54 :                                           RHS, RHS, LHS);
    5624             :   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5625          54 :                                         LHS, RHS, LHS);
    5626             : 
    5627             :   // Denominator is scaled to not be denormal, so using rcp is ok.
    5628             :   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
    5629          54 :                                   DenominatorScaled);
    5630             :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
    5631          54 :                                      DenominatorScaled);
    5632             : 
    5633             :   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
    5634             :                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
    5635             :                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
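                     :   // i.e. hwreg(HW_REG_MODE, 4, 2): WIDTH_M1 = 1 selects a 2-bit field at
                     :   // bit offset 4 of the MODE register, which (on these subtargets) is the
                     :   // FP32 denormal-mode field toggled around the division sequence below.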
    5636             : 
    5637          54 :   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
    5638             : 
    5639          54 :   if (!Subtarget->hasFP32Denormals()) {
    5640          45 :     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    5641             :     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
    5642          45 :                                                       SL, MVT::i32);
    5643             :     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
    5644             :                                        DAG.getEntryNode(),
    5645          45 :                                        EnableDenormValue, BitField);
    5646             :     SDValue Ops[3] = {
    5647             :       NegDivScale0,
    5648             :       EnableDenorm.getValue(0),
    5649             :       EnableDenorm.getValue(1)
    5650          45 :     };
    5651             : 
    5652          45 :     NegDivScale0 = DAG.getMergeValues(Ops, SL);
    5653             :   }
    5654             : 
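                     :   // Roughly: Fma0/Fma1 perform one Newton-Raphson refinement of the
                     :   // reciprocal, Mul is the initial quotient estimate, Fma2/Fma3 refine the
                     :   // quotient, and Fma4 is the final residual fed to DIV_FMAS/DIV_FIXUP,
                     :   // which apply the div_scale post-scaling and patch up the special cases.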
    5655             :   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
    5656          54 :                              ApproxRcp, One, NegDivScale0);
    5657             : 
    5658             :   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
    5659          54 :                              ApproxRcp, Fma0);
    5660             : 
    5661             :   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
    5662          54 :                            Fma1, Fma1);
    5663             : 
    5664             :   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
    5665          54 :                              NumeratorScaled, Mul);
    5666             : 
     5667          54 :   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
    5668             : 
    5669             :   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
    5670          54 :                              NumeratorScaled, Fma3);
    5671             : 
    5672          54 :   if (!Subtarget->hasFP32Denormals()) {
    5673             :     const SDValue DisableDenormValue =
    5674          45 :         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    5675             :     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
    5676             :                                         Fma4.getValue(1),
    5677             :                                         DisableDenormValue,
    5678             :                                         BitField,
    5679          45 :                                         Fma4.getValue(2));
    5680             : 
    5681             :     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    5682          45 :                                       DisableDenorm, DAG.getRoot());
    5683          45 :     DAG.setRoot(OutputChain);
    5684             :   }
    5685             : 
    5686          54 :   SDValue Scale = NumeratorScaled.getValue(1);
    5687             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
    5688          54 :                              Fma4, Fma1, Fma3, Scale);
    5689             : 
    5690          54 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
    5691             : }
    5692             : 
    5693          68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
    5694          68 :   if (DAG.getTarget().Options.UnsafeFPMath)
    5695           7 :     return lowerFastUnsafeFDIV(Op, DAG);
    5696             : 
    5697             :   SDLoc SL(Op);
    5698          61 :   SDValue X = Op.getOperand(0);
    5699          61 :   SDValue Y = Op.getOperand(1);
    5700             : 
    5701          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    5702             : 
    5703          61 :   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
    5704             : 
    5705          61 :   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
    5706             : 
    5707          61 :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
    5708             : 
    5709          61 :   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
    5710             : 
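                      :   // Refine the f64 reciprocal estimate with FMA steps before forming the
                      :   // scaled quotient.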
    5711          61 :   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
    5712             : 
    5713          61 :   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
    5714             : 
    5715          61 :   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
    5716             : 
    5717          61 :   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
    5718             : 
    5719          61 :   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
    5720          61 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
    5721             : 
    5722             :   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
    5723          61 :                              NegDivScale0, Mul, DivScale1);
    5724             : 
    5725          61 :   SDValue Scale;
    5726             : 
    5727          61 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
     5728             :     // Work around a hardware bug on SI where the condition output from
     5729             :     // div_scale is not usable.
    5730             : 
    5731          23 :     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
    5732             : 
     5733             :     // Figure out which scale to use for div_fmas.
    5734          23 :     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    5735          23 :     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    5736          23 :     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    5737          23 :     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
    5738             : 
    5739          23 :     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    5740          23 :     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
    5741             : 
    5742             :     SDValue Scale0Hi
    5743          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    5744             :     SDValue Scale1Hi
    5745          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
    5746             : 
    5747          23 :     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    5748          23 :     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    5749          23 :     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
    5750             :   } else {
    5751          38 :     Scale = DivScale1.getValue(1);
    5752             :   }
    5753             : 
    5754             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
    5755          61 :                              Fma4, Fma3, Mul, Scale);
    5756             : 
    5757          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
    5758             : }
    5759             : 
    5760         266 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
    5761             :   EVT VT = Op.getValueType();
    5762             : 
    5763             :   if (VT == MVT::f32)
    5764         174 :     return LowerFDIV32(Op, DAG);
    5765             : 
    5766             :   if (VT == MVT::f64)
    5767          68 :     return LowerFDIV64(Op, DAG);
    5768             : 
    5769             :   if (VT == MVT::f16)
    5770          24 :     return LowerFDIV16(Op, DAG);
    5771             : 
    5772           0 :   llvm_unreachable("Unexpected type for fdiv");
    5773             : }
    5774             : 
    5775       72958 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    5776             :   SDLoc DL(Op);
    5777             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    5778       72958 :   EVT VT = Store->getMemoryVT();
    5779             : 
    5780             :   if (VT == MVT::i1) {
    5781             :     return DAG.getTruncStore(Store->getChain(), DL,
    5782             :        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
    5783         378 :        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
    5784             :   }
    5785             : 
    5786             :   assert(VT.isVector() &&
    5787             :          Store->getValue().getValueType().getScalarType() == MVT::i32);
    5788             : 
    5789             :   unsigned AS = Store->getAddressSpace();
    5790      145538 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
    5791             :                           AS, Store->getAlignment())) {
    5792          26 :     return expandUnalignedStore(Store, DAG);
    5793             :   }
    5794             : 
    5795             :   MachineFunction &MF = DAG.getMachineFunction();
    5796       72743 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     5797             :   // If there is a possibility that flat instructions access scratch memory
     5798             :   // then we need to use the same legalization rules we use for private.
    5799       72743 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5800         263 :     AS = MFI->hasFlatScratchInit() ?
    5801             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5802             : 
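                      :   // Vector stores are split, scalarized, or left as-is depending on the
                      :   // address space and, for private stores, the maximum private element size.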
    5803       72743 :   unsigned NumElements = VT.getVectorNumElements();
    5804       72743 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5805             :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5806       39854 :     if (NumElements > 4)
    5807        3826 :       return SplitVectorStore(Op, DAG);
    5808       36028 :     return SDValue();
    5809       32889 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    5810         595 :     switch (Subtarget->getMaxPrivateElementSize()) {
    5811         315 :     case 4:
    5812         315 :       return scalarizeVectorStore(Store, DAG);
    5813         126 :     case 8:
    5814         126 :       if (NumElements > 2)
    5815          10 :         return SplitVectorStore(Op, DAG);
    5816         116 :       return SDValue();
    5817         154 :     case 16:
    5818         154 :       if (NumElements > 4)
    5819           2 :         return SplitVectorStore(Op, DAG);
    5820         152 :       return SDValue();
    5821           0 :     default:
    5822           0 :       llvm_unreachable("unsupported private_element_size");
    5823             :     }
    5824       32294 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    5825       32294 :     if (NumElements > 2)
    5826        3064 :       return SplitVectorStore(Op, DAG);
    5827             : 
    5828       29230 :     if (NumElements == 2)
    5829       29230 :       return Op;
    5830             : 
     5831             :     // If properly aligned, splitting might let us use ds_write_b64.
    5832           0 :     return SplitVectorStore(Op, DAG);
    5833             :   } else {
    5834           0 :     llvm_unreachable("unhandled address space");
    5835             :   }
    5836             : }
    5837             : 
    5838          51 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    5839             :   SDLoc DL(Op);
    5840          51 :   EVT VT = Op.getValueType();
    5841          51 :   SDValue Arg = Op.getOperand(0);
    5842             :   // TODO: Should this propagate fast-math-flags?
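                      :   // The hardware sin/cos operate on fractions of a full rotation, so scale
                      :   // the radian argument by 1/(2*pi) and take the fractional part first.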
    5843             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    5844             :                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
    5845             :                                               DAG.getConstantFP(0.5/M_PI, DL,
    5846          51 :                                                                 VT)));
    5847             : 
    5848          51 :   switch (Op.getOpcode()) {
    5849             :   case ISD::FCOS:
    5850          48 :     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
    5851             :   case ISD::FSIN:
    5852          54 :     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
    5853           0 :   default:
    5854           0 :     llvm_unreachable("Wrong trig opcode");
    5855             :   }
    5856             : }
    5857             : 
    5858         261 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    5859             :   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
    5860             :   assert(AtomicNode->isCompareAndSwap());
    5861             :   unsigned AS = AtomicNode->getAddressSpace();
    5862             : 
    5863             :   // No custom lowering required for local address space
    5864             :   if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
    5865          66 :     return Op;
    5866             : 
     5867             :   // Non-local address spaces require custom lowering for atomic compare and
     5868             :   // swap; the cmp and swap values are packed into a v2i32 (or v2i64 for _X2).
    5869             :   SDLoc DL(Op);
    5870         195 :   SDValue ChainIn = Op.getOperand(0);
    5871         195 :   SDValue Addr = Op.getOperand(1);
    5872         195 :   SDValue Old = Op.getOperand(2);
    5873         195 :   SDValue New = Op.getOperand(3);
    5874         195 :   EVT VT = Op.getValueType();
    5875         195 :   MVT SimpleVT = VT.getSimpleVT();
    5876         195 :   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
    5877             : 
    5878         390 :   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
    5879         195 :   SDValue Ops[] = { ChainIn, Addr, NewOld };
    5880             : 
    5881             :   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
    5882         195 :                                  Ops, VT, AtomicNode->getMemOperand());
    5883             : }
    5884             : 
    5885             : //===----------------------------------------------------------------------===//
    5886             : // Custom DAG optimizations
    5887             : //===----------------------------------------------------------------------===//
    5888             : 
    5889        1112 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
    5890             :                                                      DAGCombinerInfo &DCI) const {
    5891        1112 :   EVT VT = N->getValueType(0);
    5892        1112 :   EVT ScalarVT = VT.getScalarType();
    5893             :   if (ScalarVT != MVT::f32)
    5894         234 :     return SDValue();
    5895             : 
    5896         878 :   SelectionDAG &DAG = DCI.DAG;
    5897             :   SDLoc DL(N);
    5898             : 
    5899         878 :   SDValue Src = N->getOperand(0);
    5900             :   EVT SrcVT = Src.getValueType();
    5901             : 
    5902             :   // TODO: We could try to match extracting the higher bytes, which would be
    5903             :   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
    5904             :   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
    5905             :   // about in practice.
    5906         878 :   if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    5907         800 :     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
    5908         106 :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
    5909         106 :       DCI.AddToWorklist(Cvt.getNode());
    5910         106 :       return Cvt;
    5911             :     }
    5912             :   }
    5913             : 
    5914         772 :   return SDValue();
    5915             : }
    5916             : 
    5917             : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
    5918             : 
    5919             : // This is a variant of
    5920             : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
    5921             : //
    5922             : // The normal DAG combiner will do this, but only if the add has one use since
    5923             : // that would increase the number of instructions.
    5924             : //
    5925             : // This prevents us from seeing a constant offset that can be folded into a
    5926             : // memory instruction's addressing mode. If we know the resulting add offset of
    5927             : // a pointer can be folded into an addressing offset, we can replace the pointer
     5928             : // operand with the add of the new constant offset. This eliminates one of the
     5929             : // uses, and may allow the remaining use to also be simplified.
    5930             : //
    5931         202 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    5932             :                                                unsigned AddrSpace,
    5933             :                                                EVT MemVT,
    5934             :                                                DAGCombinerInfo &DCI) const {
    5935         202 :   SDValue N0 = N->getOperand(0);
    5936         202 :   SDValue N1 = N->getOperand(1);
    5937             : 
    5938             :   // We only do this to handle cases where it's profitable when there are
    5939             :   // multiple uses of the add, so defer to the standard combine.
    5940         202 :   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
    5941             :       N0->hasOneUse())
    5942         156 :     return SDValue();
    5943             : 
    5944             :   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
    5945             :   if (!CN1)
    5946           0 :     return SDValue();
    5947             : 
    5948             :   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    5949             :   if (!CAdd)
    5950           2 :     return SDValue();
    5951             : 
    5952             :   // If the resulting offset is too large, we can't fold it into the addressing
    5953             :   // mode offset.
    5954             :   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
    5955          44 :   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
    5956             : 
    5957          44 :   AddrMode AM;
    5958          44 :   AM.HasBaseReg = true;
    5959          44 :   AM.BaseOffs = Offset.getSExtValue();
    5960          88 :   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    5961          14 :     return SDValue();
    5962             : 
    5963          30 :   SelectionDAG &DAG = DCI.DAG;
    5964             :   SDLoc SL(N);
    5965          30 :   EVT VT = N->getValueType(0);
    5966             : 
    5967          30 :   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
    5968          30 :   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
    5969             : 
    5970             :   SDNodeFlags Flags;
    5971          30 :   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
    5972           0 :                           (N0.getOpcode() == ISD::OR ||
    5973           0 :                            N0->getFlags().hasNoUnsignedWrap()));
    5974             : 
    5975          30 :   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
    5976             : }
    5977             : 
    5978      339815 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
    5979             :                                                   DAGCombinerInfo &DCI) const {
    5980      339815 :   SDValue Ptr = N->getBasePtr();
    5981      339815 :   SelectionDAG &DAG = DCI.DAG;
    5982             :   SDLoc SL(N);
    5983             : 
    5984             :   // TODO: We could also do this for multiplies.
    5985      339815 :   if (Ptr.getOpcode() == ISD::SHL) {
    5986             :     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(),  N->getAddressSpace(),
    5987         202 :                                           N->getMemoryVT(), DCI);
    5988         202 :     if (NewPtr) {
    5989             :       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
    5990             : 
    5991          60 :       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
    5992          30 :       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    5993             :     }
    5994             :   }
    5995             : 
    5996      339785 :   return SDValue();
    5997             : }
    5998             : 
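                      : // Returns true if applying Opc to a 32-bit half with this constant folds to
                      : // either a constant or a copy of the other operand.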
    5999             : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
    6000        4260 :   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
    6001        3584 :          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
    6002        1830 :          (Opc == ISD::XOR && Val == 0);
    6003             : }
    6004             : 
     6005             : // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor
     6006             : // operations. This will typically happen anyway for a VALU 64-bit and. This
     6007             : // exposes other 32-bit integer combine opportunities since most 64-bit
     6008             : // operations are decomposed this way.
     6009             : // TODO: We won't want this for SALU, especially if it is an inline immediate.
    6010        1754 : SDValue SITargetLowering::splitBinaryBitConstantOp(
    6011             :   DAGCombinerInfo &DCI,
    6012             :   const SDLoc &SL,
    6013             :   unsigned Opc, SDValue LHS,
    6014             :   const ConstantSDNode *CRHS) const {
    6015             :   uint64_t Val = CRHS->getZExtValue();
    6016             :   uint32_t ValLo = Lo_32(Val);
    6017             :   uint32_t ValHi = Hi_32(Val);
    6018        1754 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6019             : 
     6020             :   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
     6021             :        bitOpWithConstantIsReducible(Opc, ValHi)) ||
     6022         140 :       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    6023             :     // If we need to materialize a 64-bit immediate, it will be split up later
    6024             :     // anyway. Avoid creating the harder to understand 64-bit immediate
    6025             :     // materialization.
    6026        1302 :     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
    6027             :   }
    6028             : 
    6029         452 :   return SDValue();
    6030             : }
    6031             : 
     6032             : // Returns true if the argument is a boolean value that is not serialized into
     6033             : // memory or an argument and does not require v_cndmask_b32 to be deserialized.
    6034             : static bool isBoolSGPR(SDValue V) {
    6035             :   if (V.getValueType() != MVT::i1)
    6036             :     return false;
    6037          49 :   switch (V.getOpcode()) {
    6038             :   default: break;
    6039             :   case ISD::SETCC:
    6040             :   case ISD::AND:
    6041             :   case ISD::OR:
    6042             :   case ISD::XOR:
    6043             :   case AMDGPUISD::FP_CLASS:
    6044             :     return true;
    6045             :   }
    6046             :   return false;
    6047             : }
    6048             : 
    6049       28952 : SDValue SITargetLowering::performAndCombine(SDNode *N,
    6050             :                                             DAGCombinerInfo &DCI) const {
    6051       28952 :   if (DCI.isBeforeLegalize())
    6052         810 :     return SDValue();
    6053             : 
    6054       28142 :   SelectionDAG &DAG = DCI.DAG;
    6055       28142 :   EVT VT = N->getValueType(0);
    6056       28142 :   SDValue LHS = N->getOperand(0);
    6057       28142 :   SDValue RHS = N->getOperand(1);
    6058             : 
    6059             : 
    6060             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6061        1515 :   if (VT == MVT::i64 && CRHS) {
    6062        1305 :     if (SDValue Split
    6063        2610 :         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
    6064        1225 :       return Split;
    6065             :   }
    6066             : 
    6067       26917 :   if (CRHS && VT == MVT::i32) {
    6068             :     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    6069             :     // nb = number of trailing zeroes in mask
    6070             :     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    6071             :     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    6072             :     uint64_t Mask = CRHS->getZExtValue();
    6073             :     unsigned Bits = countPopulation(Mask);
    6074       29471 :     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
    6075       23756 :         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
    6076             :       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
    6077          71 :         unsigned Shift = CShift->getZExtValue();
    6078          71 :         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
    6079          71 :         unsigned Offset = NB + Shift;
    6080          71 :         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
    6081             :           SDLoc SL(N);
    6082             :           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    6083             :                                     LHS->getOperand(0),
    6084             :                                     DAG.getConstant(Offset, SL, MVT::i32),
    6085         213 :                                     DAG.getConstant(Bits, SL, MVT::i32));
    6086          71 :           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
    6087             :           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
    6088          71 :                                     DAG.getValueType(NarrowVT));
    6089          71 :           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
    6090         213 :                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
    6091          71 :           return Shl;
    6092             :         }
    6093             :       }
    6094             :     }
    6095             :   }
    6096             : 
    6097             :   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
    6098             :   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
    6099       26846 :   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    6100             :     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    6101             :     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
    6102             : 
    6103         144 :     SDValue X = LHS.getOperand(0);
    6104         144 :     SDValue Y = RHS.getOperand(0);
    6105         144 :     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
    6106         138 :       return SDValue();
    6107             : 
    6108           6 :     if (LCC == ISD::SETO) {
    6109             :       if (X != LHS.getOperand(1))
    6110           0 :         return SDValue();
    6111             : 
    6112           4 :       if (RCC == ISD::SETUNE) {
    6113             :         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
    6114           4 :         if (!C1 || !C1->isInfinity() || C1->isNegative())
    6115           0 :           return SDValue();
    6116             : 
    6117             :         const uint32_t Mask = SIInstrFlags::N_NORMAL |
    6118             :                               SIInstrFlags::N_SUBNORMAL |
    6119             :                               SIInstrFlags::N_ZERO |
    6120             :                               SIInstrFlags::P_ZERO |
    6121             :                               SIInstrFlags::P_SUBNORMAL |
    6122             :                               SIInstrFlags::P_NORMAL;
    6123             : 
    6124             :         static_assert(((~(SIInstrFlags::S_NAN |
    6125             :                           SIInstrFlags::Q_NAN |
    6126             :                           SIInstrFlags::N_INFINITY |
    6127             :                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
    6128             :                       "mask not equal");
    6129             : 
    6130             :         SDLoc DL(N);
    6131             :         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6132           4 :                            X, DAG.getConstant(Mask, DL, MVT::i32));
    6133             :       }
    6134             :     }
    6135             :   }
    6136             : 
    6137       22043 :   if (VT == MVT::i32 &&
    6138       22042 :       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    6139             :     // and x, (sext cc from i1) => select cc, x, 0
    6140          24 :     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
    6141             :       std::swap(LHS, RHS);
    6142             :     if (isBoolSGPR(RHS.getOperand(0)))
    6143          16 :       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
    6144          64 :                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
    6145             :   }
    6146             : 
    6147       26690 :   return SDValue();
    6148             : }
    6149             : 
    6150       15704 : SDValue SITargetLowering::performOrCombine(SDNode *N,
    6151             :                                            DAGCombinerInfo &DCI) const {
    6152       15704 :   SelectionDAG &DAG = DCI.DAG;
    6153       15704 :   SDValue LHS = N->getOperand(0);
    6154       15704 :   SDValue RHS = N->getOperand(1);
    6155             : 
    6156             :   EVT VT = N->getValueType(0);
    6157             :   if (VT == MVT::i1) {
    6158             :     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    6159          85 :     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
    6160             :         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    6161          14 :       SDValue Src = LHS.getOperand(0);
    6162             :       if (Src != RHS.getOperand(0))
    6163           1 :         return SDValue();
    6164             : 
    6165             :       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    6166             :       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    6167          13 :       if (!CLHS || !CRHS)
    6168           0 :         return SDValue();
    6169             : 
    6170             :       // Only 10 bits are used.
    6171             :       static const uint32_t MaxMask = 0x3ff;
    6172             : 
    6173          26 :       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    6174             :       SDLoc DL(N);
    6175             :       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6176          26 :                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
    6177             :     }
    6178             : 
    6179          71 :     return SDValue();
    6180             :   }
    6181             : 
    6182             :   if (VT != MVT::i64)
    6183       13594 :     return SDValue();
    6184             : 
    6185             :   // TODO: This could be a generic combine with a predicate for extracting the
    6186             :   // high half of an integer being free.
    6187             : 
    6188             :   // (or i64:x, (zero_extend i32:y)) ->
    6189             :   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
    6190        2025 :   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
    6191             :       RHS.getOpcode() != ISD::ZERO_EXTEND)
    6192             :     std::swap(LHS, RHS);
    6193             : 
    6194        2025 :   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    6195        1094 :     SDValue ExtSrc = RHS.getOperand(0);
    6196             :     EVT SrcVT = ExtSrc.getValueType();
    6197             :     if (SrcVT == MVT::i32) {
    6198             :       SDLoc SL(N);
    6199             :       SDValue LowLHS, HiBits;
    6200        2188 :       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
    6201        1094 :       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
    6202             : 
    6203        1094 :       DCI.AddToWorklist(LowOr.getNode());
    6204        1094 :       DCI.AddToWorklist(HiBits.getNode());
    6205             : 
    6206             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    6207        1094 :                                 LowOr, HiBits);
    6208        1094 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    6209             :     }
    6210             :   }
    6211             : 
    6212             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    6213             :   if (CRHS) {
    6214         161 :     if (SDValue Split
    6215         322 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
    6216          51 :       return Split;
    6217             :   }
    6218             : 
    6219         880 :   return SDValue();
    6220             : }
    6221             : 
    6222        1416 : SDValue SITargetLowering::performXorCombine(SDNode *N,
    6223             :                                             DAGCombinerInfo &DCI) const {
    6224             :   EVT VT = N->getValueType(0);
    6225             :   if (VT != MVT::i64)
    6226         842 :     return SDValue();
    6227             : 
    6228         574 :   SDValue LHS = N->getOperand(0);
    6229         574 :   SDValue RHS = N->getOperand(1);
    6230             : 
    6231             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6232             :   if (CRHS) {
    6233         288 :     if (SDValue Split
    6234         576 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
    6235          26 :       return Split;
    6236             :   }
    6237             : 
    6238         548 :   return SDValue();
    6239             : }
    6240             : 
    6241             : // Instructions that will be lowered with a final instruction that zeros the
    6242             : // high result bits.
    6243             : // XXX - probably only need to list legal operations.
    6244         243 : static bool fp16SrcZerosHighBits(unsigned Opc) {
    6245         243 :   switch (Opc) {
    6246             :   case ISD::FADD:
    6247             :   case ISD::FSUB:
    6248             :   case ISD::FMUL:
    6249             :   case ISD::FDIV:
    6250             :   case ISD::FREM:
    6251             :   case ISD::FMA:
    6252             :   case ISD::FMAD:
    6253             :   case ISD::FCANONICALIZE:
    6254             :   case ISD::FP_ROUND:
    6255             :   case ISD::UINT_TO_FP:
    6256             :   case ISD::SINT_TO_FP:
    6257             :   case ISD::FABS:
    6258             :     // Fabs is lowered to a bit operation, but it's an and which will clear the
    6259             :     // high bits anyway.
    6260             :   case ISD::FSQRT:
    6261             :   case ISD::FSIN:
    6262             :   case ISD::FCOS:
    6263             :   case ISD::FPOWI:
    6264             :   case ISD::FPOW:
    6265             :   case ISD::FLOG:
    6266             :   case ISD::FLOG2:
    6267             :   case ISD::FLOG10:
    6268             :   case ISD::FEXP:
    6269             :   case ISD::FEXP2:
    6270             :   case ISD::FCEIL:
    6271             :   case ISD::FTRUNC:
    6272             :   case ISD::FRINT:
    6273             :   case ISD::FNEARBYINT:
    6274             :   case ISD::FROUND:
    6275             :   case ISD::FFLOOR:
    6276             :   case ISD::FMINNUM:
    6277             :   case ISD::FMAXNUM:
    6278             :   case AMDGPUISD::FRACT:
    6279             :   case AMDGPUISD::CLAMP:
    6280             :   case AMDGPUISD::COS_HW:
    6281             :   case AMDGPUISD::SIN_HW:
    6282             :   case AMDGPUISD::FMIN3:
    6283             :   case AMDGPUISD::FMAX3:
    6284             :   case AMDGPUISD::FMED3:
    6285             :   case AMDGPUISD::FMAD_FTZ:
    6286             :   case AMDGPUISD::RCP:
    6287             :   case AMDGPUISD::RSQ:
    6288             :   case AMDGPUISD::LDEXP:
    6289             :     return true;
    6290          86 :   default:
    6291             :     // fcopysign, select and others may be lowered to 32-bit bit operations
    6292             :     // which don't zero the high bits.
    6293          86 :     return false;
    6294             :   }
    6295             : }
    6296             : 
    6297       15989 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
    6298             :                                                    DAGCombinerInfo &DCI) const {
    6299       27459 :   if (!Subtarget->has16BitInsts() ||
    6300       11470 :       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6301       12426 :     return SDValue();
    6302             : 
    6303        3563 :   EVT VT = N->getValueType(0);
    6304             :   if (VT != MVT::i32)
    6305        1654 :     return SDValue();
    6306             : 
    6307        1909 :   SDValue Src = N->getOperand(0);
    6308             :   if (Src.getValueType() != MVT::i16)
    6309         267 :     return SDValue();
    6310             : 
    6311             :   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
    6312             :   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
    6313        1642 :   if (Src.getOpcode() == ISD::BITCAST) {
    6314         243 :     SDValue BCSrc = Src.getOperand(0);
    6315         243 :     if (BCSrc.getValueType() == MVT::f16 &&
    6316         243 :         fp16SrcZerosHighBits(BCSrc.getOpcode()))
    6317         471 :       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
    6318             :   }
    6319             : 
    6320        1485 :   return SDValue();
    6321             : }
    6322             : 
    6323          82 : SDValue SITargetLowering::performClassCombine(SDNode *N,
    6324             :                                               DAGCombinerInfo &DCI) const {
    6325          82 :   SelectionDAG &DAG = DCI.DAG;
    6326          82 :   SDValue Mask = N->getOperand(1);
    6327             : 
    6328             :   // fp_class x, 0 -> false
    6329             :   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    6330          59 :     if (CMask->isNullValue())
    6331           4 :       return DAG.getConstant(0, SDLoc(N), MVT::i1);
    6332             :   }
    6333             : 
    6334          80 :   if (N->getOperand(0).isUndef())
    6335           2 :     return DAG.getUNDEF(MVT::i1);
    6336             : 
    6337          78 :   return SDValue();
    6338             : }
    6339             : 
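                      : // Treat a value as never being a signaling NaN when FP exceptions are
                      : // disabled; otherwise defer to the DAG's NaN analysis.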
    6340             : static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
    6341          61 :   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
    6342             :     return true;
    6343             : 
    6344          29 :   return DAG.isKnownNeverNaN(Op);
    6345             : }
    6346             : 
    6347         413 : static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
    6348             :                             const SISubtarget *ST, unsigned MaxDepth=5) {
    6349             :   // If source is a result of another standard FP operation it is already in
    6350             :   // canonical form.
    6351             : 
    6352         413 :   switch (Op.getOpcode()) {
    6353             :   default:
    6354             :     break;
    6355             : 
    6356             :   // These will flush denorms if required.
    6357             :   case ISD::FADD:
    6358             :   case ISD::FSUB:
    6359             :   case ISD::FMUL:
    6360             :   case ISD::FSQRT:
    6361             :   case ISD::FCEIL:
    6362             :   case ISD::FFLOOR:
    6363             :   case ISD::FMA:
    6364             :   case ISD::FMAD:
    6365             : 
    6366             :   case ISD::FCANONICALIZE:
    6367             :     return true;
    6368             : 
    6369             :   case ISD::FP_ROUND:
    6370          36 :     return Op.getValueType().getScalarType() != MVT::f16 ||
    6371             :            ST->hasFP16Denormals();
    6372             : 
    6373             :   case ISD::FP_EXTEND:
    6374          12 :     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
    6375             :            ST->hasFP16Denormals();
    6376             : 
    6377           0 :   case ISD::FP16_TO_FP:
    6378             :   case ISD::FP_TO_FP16:
    6379             :     return ST->hasFP16Denormals();
    6380             : 
    6381             :   // It can/will be lowered or combined as a bit operation.
    6382             :   // Need to check their input recursively to handle.
    6383          68 :   case ISD::FNEG:
    6384             :   case ISD::FABS:
    6385         136 :     return (MaxDepth > 0) &&
    6386         136 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
    6387             : 
    6388             :   case ISD::FSIN:
    6389             :   case ISD::FCOS:
    6390             :   case ISD::FSINCOS:
    6391          32 :     return Op.getValueType().getScalarType() != MVT::f16;
    6392             : 
    6393             :   // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
    6394             :   // For such targets need to check their input recursively.
    6395          44 :   case ISD::FMINNUM:
    6396             :   case ISD::FMAXNUM:
    6397             :   case ISD::FMINNAN:
    6398             :   case ISD::FMAXNAN:
    6399             : 
    6400          22 :     if (ST->supportsMinMaxDenormModes() &&
    6401          66 :         DAG.isKnownNeverNaN(Op.getOperand(0)) &&
    6402           0 :         DAG.isKnownNeverNaN(Op.getOperand(1)))
    6403             :       return true;
    6404             : 
    6405          44 :     return (MaxDepth > 0) &&
    6406         100 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
    6407          12 :            isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
    6408             : 
    6409             :   case ISD::ConstantFP: {
    6410             :     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
    6411          24 :     return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
    6412             :   }
    6413             :   }
    6414             :   return false;
    6415             : }
    6416             : 
    6417             : // Constant fold canonicalize.
    6418         401 : SDValue SITargetLowering::performFCanonicalizeCombine(
    6419             :   SDNode *N,
    6420             :   DAGCombinerInfo &DCI) const {
    6421         401 :   SelectionDAG &DAG = DCI.DAG;
    6422         401 :   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
    6423             : 
    6424         401 :   if (!CFP) {
    6425         301 :     SDValue N0 = N->getOperand(0);
    6426         301 :     EVT VT = N0.getValueType().getScalarType();
    6427         301 :     auto ST = getSubtarget();
    6428             : 
    6429         168 :     if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
    6430          28 :          (VT == MVT::f64 && ST->hasFP64Denormals()) ||
    6431         265 :          (VT == MVT::f16 && ST->hasFP16Denormals())) &&
    6432         160 :         DAG.isKnownNeverNaN(N0))
    6433          10 :       return N0;
    6434             : 
    6435             :     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
    6436             : 
    6437         583 :     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
    6438         289 :         isCanonicalized(DAG, N0, ST))
    6439          94 :       return N0;
    6440             : 
    6441         197 :     return SDValue();
    6442             :   }
    6443             : 
    6444             :   const APFloat &C = CFP->getValueAPF();
    6445             : 
    6446             :   // Flush denormals to 0 if not enabled.
    6447         100 :   if (C.isDenormal()) {
    6448          24 :     EVT VT = N->getValueType(0);
    6449          24 :     EVT SVT = VT.getScalarType();
    6450           4 :     if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
    6451           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6452             : 
    6453           4 :     if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
    6454           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6455             : 
    6456          16 :     if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
    6457           0 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6458             :   }
    6459             : 
    6460          96 :   if (C.isNaN()) {
    6461          42 :     EVT VT = N->getValueType(0);
    6462             :     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    6463          42 :     if (C.isSignaling()) {
    6464             :       // Quiet a signaling NaN.
    6465          44 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6466             :     }
    6467             : 
    6468             :     // Make sure it is the canonical NaN bitpattern.
    6469             :     //
    6470             :     // TODO: Can we use -1 as the canonical NaN value since it's an inline
    6471             :     // immediate?
    6472          60 :     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
    6473          28 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6474             :   }
    6475             : 
    6476          60 :   return N->getOperand(0);
    6477             : }
    6478             : 
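                      : // Map a 2-operand min/max opcode to the corresponding 3-operand min3/max3
                      : // opcode.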
    6479             : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
    6480          51 :   switch (Opc) {
    6481             :   case ISD::FMAXNUM:
    6482             :     return AMDGPUISD::FMAX3;
    6483           5 :   case ISD::SMAX:
    6484             :     return AMDGPUISD::SMAX3;
    6485           5 :   case ISD::UMAX:
    6486             :     return AMDGPUISD::UMAX3;
    6487          12 :   case ISD::FMINNUM:
    6488             :     return AMDGPUISD::FMIN3;
    6489           8 :   case ISD::SMIN:
    6490             :     return AMDGPUISD::SMIN3;
    6491           5 :   case ISD::UMIN:
    6492             :     return AMDGPUISD::UMIN3;
    6493           0 :   default:
    6494           0 :     llvm_unreachable("Not a min/max opcode");
    6495             :   }
    6496             : }
    6497             : 
    6498         152 : SDValue SITargetLowering::performIntMed3ImmCombine(
    6499             :   SelectionDAG &DAG, const SDLoc &SL,
    6500             :   SDValue Op0, SDValue Op1, bool Signed) const {
    6501             :   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
    6502             :   if (!K1)
    6503          92 :     return SDValue();
    6504             : 
    6505             :   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    6506             :   if (!K0)
    6507           3 :     return SDValue();
    6508             : 
    6509          57 :   if (Signed) {
    6510          48 :     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
    6511           3 :       return SDValue();
    6512             :   } else {
    6513           9 :     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
    6514           3 :       return SDValue();
    6515             :   }
    6516             : 
    6517          51 :   EVT VT = K0->getValueType(0);
    6518          51 :   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
    6519           8 :   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    6520             :     return DAG.getNode(Med3Opc, SL, VT,
    6521          49 :                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
    6522             :   }
    6523             : 
    6524             :   // If there isn't a 16-bit med3 operation, convert to 32-bit.
    6525             :   MVT NVT = MVT::i32;
    6526           2 :   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    6527             : 
    6528           2 :   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
    6529           2 :   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
    6530           2 :   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
    6531             : 
    6532           2 :   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
    6533           2 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
    6534             : }
    6535             : 
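                      : // Return the FP constant if Op is a scalar ConstantFP or a build_vector
                      : // splat of one, otherwise null.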
    6536         775 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
    6537             :   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    6538             :     return C;
    6539             : 
    6540             :   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    6541          43 :     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
    6542             :       return C;
    6543             :   }
    6544             : 
    6545             :   return nullptr;
    6546             : }
    6547             : 
    6548         454 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
    6549             :                                                   const SDLoc &SL,
    6550             :                                                   SDValue Op0,
    6551             :                                                   SDValue Op1) const {
    6552         454 :   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
    6553         454 :   if (!K1)
    6554         133 :     return SDValue();
    6555             : 
    6556         321 :   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
    6557         321 :   if (!K0)
    6558           3 :     return SDValue();
    6559             : 
    6560             :   // Ordered >= (although NaN inputs should have folded away by now).
    6561         318 :   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
    6562         318 :   if (Cmp == APFloat::cmpGreaterThan)
    6563           8 :     return SDValue();
    6564             : 
    6565             :   // TODO: Check IEEE bit enabled?
    6566         310 :   EVT VT = Op0.getValueType();
    6567         310 :   if (Subtarget->enableDX10Clamp()) {
    6568             :     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    6569             :     // hardware fmed3 behavior converting to a min.
    6570             :     // FIXME: Should this be allowing -0.0?
    6571         552 :     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
    6572         248 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
    6573             :   }
    6574             : 
    6575             :   // med3 for f16 is only available on gfx9+, and not available for v2f16.
    6576          10 :   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    6577             :     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    6578             :     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    6579             :     // then give the other result, which is different from med3 with a NaN
    6580             :     // input.
    6581          53 :     SDValue Var = Op0.getOperand(0);
    6582          26 :     if (!isKnownNeverSNan(DAG, Var))
    6583          15 :       return SDValue();
    6584             : 
    6585             :     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
    6586          38 :                        Var, SDValue(K0, 0), SDValue(K1, 0));
    6587             :   }
    6588             : 
    6589           9 :   return SDValue();
    6590             : }
    6591             : 
    6592        2935 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    6593             :                                                DAGCombinerInfo &DCI) const {
    6594        2935 :   SelectionDAG &DAG = DCI.DAG;
    6595             : 
    6596             :   EVT VT = N->getValueType(0);
    6597             :   unsigned Opc = N->getOpcode();
    6598        2935 :   SDValue Op0 = N->getOperand(0);
    6599        2935 :   SDValue Op1 = N->getOperand(1);
    6600             : 
     6601             :   // Only do this if the inner op has one use since this will just increase
    6602             :   // register pressure for no benefit.
    6603             : 
    6604             : 
    6605        2935 :   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
    6606        2935 :       VT != MVT::f64 &&
    6607         436 :       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
    6608             :     // max(max(a, b), c) -> max3(a, b, c)
    6609             :     // min(min(a, b), c) -> min3(a, b, c)
    6610        2614 :     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
    6611             :       SDLoc DL(N);
    6612             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    6613             :                          DL,
    6614             :                          N->getValueType(0),
    6615             :                          Op0.getOperand(0),
    6616             :                          Op0.getOperand(1),
    6617          41 :                          Op1);
    6618             :     }
    6619             : 
    6620             :     // Try commuted.
    6621             :     // max(a, max(b, c)) -> max3(a, b, c)
    6622             :     // min(a, min(b, c)) -> min3(a, b, c)
    6623        2536 :     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
    6624             :       SDLoc DL(N);
    6625             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    6626             :                          DL,
    6627             :                          N->getValueType(0),
    6628             :                          Op0,
    6629             :                          Op1.getOperand(0),
    6630          10 :                          Op1.getOperand(1));
    6631             :     }
    6632             :   }
    6633             : 
    6634             :   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
    6635        2980 :   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    6636         180 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
    6637          45 :       return Med3;
    6638             :   }
    6639             : 
    6640        2907 :   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    6641         124 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
    6642           6 :       return Med3;
    6643             :   }
    6644             : 
    6645             :   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
    6646         941 :   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
    6647          21 :        (Opc == AMDGPUISD::FMIN_LEGACY &&
    6648             :         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
    6649             :       (VT == MVT::f32 || VT == MVT::f64 ||
    6650          70 :        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
    6651        3324 :        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    6652             :       Op0.hasOneUse()) {
    6653         908 :     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
    6654         286 :       return Res;
    6655             :   }
    6656             : 
    6657        2547 :   return SDValue();
    6658             : }
    6659             : 
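                      : // Returns true if the pair of constants is 0.0 and 1.0 in either order.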
    6660         160 : static bool isClampZeroToOne(SDValue A, SDValue B) {
    6661             :   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    6662             :     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
    6663             :       // FIXME: Should this be allowing -0.0?
    6664         173 :       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
    6665          12 :              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    6666             :     }
    6667             :   }
    6668             : 
    6669             :   return false;
    6670             : }
    6671             : 
     6672             : // FIXME: Should only worry about sNaNs for the version with a chain.
    6673         107 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
    6674             :                                               DAGCombinerInfo &DCI) const {
    6675         107 :   EVT VT = N->getValueType(0);
    6676             :   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
    6677             :   // NaNs. With a NaN input, the order of the operands may change the result.
    6678             : 
    6679         107 :   SelectionDAG &DAG = DCI.DAG;
    6680             :   SDLoc SL(N);
    6681             : 
    6682         107 :   SDValue Src0 = N->getOperand(0);
    6683         107 :   SDValue Src1 = N->getOperand(1);
    6684         107 :   SDValue Src2 = N->getOperand(2);
    6685             : 
    6686         107 :   if (isClampZeroToOne(Src0, Src1)) {
     6687             :     // fmed3(const_a, const_b, x) -> clamp(x) is safe in all cases, including
     6688             :     // signaling NaNs.
    6689             :     // FIXME: Should this be allowing -0.0?
    6690          36 :     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
    6691             :   }
    6692             : 
     6693             :   // FIXME: dx10_clamp behavior is assumed in instcombine. Should we really
     6694             :   // bother handling the no-dx10-clamp case?
    6695          71 :   if (Subtarget->enableDX10Clamp()) {
     6696             :     // If NaN is clamped to 0, we are free to reorder the inputs.
    6697             : 
    6698             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    6699             :       std::swap(Src0, Src1);
    6700             : 
    6701             :     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
    6702             :       std::swap(Src1, Src2);
    6703             : 
    6704             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    6705             :       std::swap(Src0, Src1);
    6706             : 
    6707          53 :     if (isClampZeroToOne(Src1, Src2))
    6708          12 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
    6709             :   }
    6710             : 
    6711          59 :   return SDValue();
    6712             : }
    6713             : 
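                      : // cvt_pkrtz_f16_f32 with two undef sources folds to undef.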
    6714         133 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
    6715             :                                                  DAGCombinerInfo &DCI) const {
    6716         133 :   SDValue Src0 = N->getOperand(0);
    6717         133 :   SDValue Src1 = N->getOperand(1);
    6718         133 :   if (Src0.isUndef() && Src1.isUndef())
    6719           6 :     return DCI.DAG.getUNDEF(N->getValueType(0));
    6720         130 :   return SDValue();
    6721             : }
    6722             : 
    6723      107238 : SDValue SITargetLowering::performExtractVectorEltCombine(
    6724             :   SDNode *N, DAGCombinerInfo &DCI) const {
    6725      107238 :   SDValue Vec = N->getOperand(0);
    6726             : 
    6727      107238 :   SelectionDAG &DAG = DCI.DAG;
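                      :   // (extract_vector_elt (fneg v), i) -> (fneg (extract_vector_elt v, i))
                      :   // when every user of the extract can absorb an fneg source modifier.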
    6728      107238 :   if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
    6729             :     SDLoc SL(N);
    6730          20 :     EVT EltVT = N->getValueType(0);
    6731          20 :     SDValue Idx = N->getOperand(1);
    6732             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    6733          20 :                               Vec.getOperand(0), Idx);
    6734          20 :     return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
    6735             :   }
    6736             : 
    6737      107218 :   return SDValue();
    6738             : }
    6739             : 
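                      : // If Hi is a bitcast from f16 and Lo is a constant or undef, rewrite both
                      : // elements in terms of f16 so the caller can build a v2f16 vector instead.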
    6740        1104 : static bool convertBuildVectorCastElt(SelectionDAG &DAG,
    6741             :                                       SDValue &Lo, SDValue &Hi) {
    6742             :   if (Hi.getOpcode() == ISD::BITCAST &&
    6743        1104 :       Hi.getOperand(0).getValueType() == MVT::f16 &&
    6744           4 :       (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
    6745           4 :     Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
    6746           2 :     Hi = Hi.getOperand(0);
    6747           2 :     return true;
    6748             :   }
    6749             : 
    6750             :   return false;
    6751             : }
    6752             : 
    6753       96979 : SDValue SITargetLowering::performBuildVectorCombine(
    6754             :   SDNode *N, DAGCombinerInfo &DCI) const {
    6755             :   SDLoc SL(N);
    6756             : 
    6757             :   if (!isTypeLegal(MVT::v2i16))
    6758       84946 :     return SDValue();
    6759       12033 :   SelectionDAG &DAG = DCI.DAG;
    6760       12033 :   EVT VT = N->getValueType(0);
    6761             : 
    6762             :   if (VT == MVT::v2i16) {
    6763         553 :     SDValue Lo = N->getOperand(0);
    6764         553 :     SDValue Hi = N->getOperand(1);
    6765             : 
    6766             :     // v2i16 build_vector (const|undef), (bitcast f16:$x)
     6767             :     // -> bitcast (v2f16 build_vector const|undef, $x)
    6768         553 :     if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
    6769           4 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
    6770           2 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    6771             :     }
    6772             : 
    6773         551 :     if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
    6774           0 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
    6775           0 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    6776             :     }
    6777             :   }
    6778             : 
    6779       12031 :   return SDValue();
    6780             : }
    6781             : 
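                      : // Pick the fused opcode (FMAD or FMA) to use when contracting an fadd/fsub
                      : // with a multiply, or return 0 if no fused form should be used.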
    6782         198 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
    6783             :                                           const SDNode *N0,
    6784             :                                           const SDNode *N1) const {
    6785         198 :   EVT VT = N0->getValueType(0);
    6786             : 
    6787             :   // Only do this if we are not trying to support denormals. v_mad_f32 does not
    6788             :   // support denormals ever.
    6789          92 :   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
    6790          60 :       (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
    6791             :     return ISD::FMAD;
    6792             : 
    6793             :   const TargetOptions &Options = DAG.getTarget().Options;
    6794         180 :   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
    6795          51 :        (N0->getFlags().hasUnsafeAlgebra() &&
    6796         144 :         N1->getFlags().hasUnsafeAlgebra())) &&
    6797          48 :       isFMAFasterThanFMulAndFAdd(VT)) {
    6798             :     return ISD::FMA;
    6799             :   }
    6800             : 
    6801             :   return 0;
    6802             : }
    6803             : 
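                      : // Emit a MAD_I64_I32 / MAD_U64_U32 node computing N0 * N1 + N2 in 64 bits
                      : // and truncate the result back to VT.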
    6804          20 : static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
    6805             :                            EVT VT,
    6806             :                            SDValue N0, SDValue N1, SDValue N2,
    6807             :                            bool Signed) {
    6808          20 :   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
    6809          20 :   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
    6810          20 :   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
    6811          20 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
    6812             : }
    6813             : 
    6814      130460 : SDValue SITargetLowering::performAddCombine(SDNode *N,
    6815             :                                             DAGCombinerInfo &DCI) const {
    6816      130460 :   SelectionDAG &DAG = DCI.DAG;
    6817      130460 :   EVT VT = N->getValueType(0);
    6818             :   SDLoc SL(N);
    6819      130460 :   SDValue LHS = N->getOperand(0);
    6820      130460 :   SDValue RHS = N->getOperand(1);
    6821             : 
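                      :   // Fold (add (mul x, y), z) on scalar types wider than 32 bits into
                      :   // mad_u64_u32 / mad_i64_i32 when both multiply operands fit in 32 bits.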
    6822      130208 :   if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
    6823        1892 :       && Subtarget->hasMad64_32() &&
    6824      130913 :       !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
    6825             :       VT.getScalarSizeInBits() <= 64) {
    6826          24 :     if (LHS.getOpcode() != ISD::MUL)
    6827             :       std::swap(LHS, RHS);
    6828             : 
    6829          24 :     SDValue MulLHS = LHS.getOperand(0);
    6830          24 :     SDValue MulRHS = LHS.getOperand(1);
    6831          24 :     SDValue AddRHS = RHS;
    6832             : 
    6833             :     // TODO: Maybe restrict if SGPR inputs.
    6834          38 :     if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
    6835          14 :         numBitsUnsigned(MulRHS, DAG) <= 32) {
    6836          13 :       MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
    6837          13 :       MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
    6838          13 :       AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
    6839          13 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    6840             :     }
    6841             : 
    6842          18 :     if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
    6843           7 :       MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
    6844           7 :       MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
    6845           7 :       AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
    6846           7 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    6847             :     }
    6848             : 
    6849           4 :     return SDValue();
    6850             :   }
    6851             : 
    6852             :   if (VT != MVT::i32)
    6853      111007 :     return SDValue();
    6854             : 
    6855             :   // add x, zext (setcc) => addcarry x, 0, setcc
    6856             :   // add x, sext (setcc) => subcarry x, 0, setcc
    6857             :   unsigned Opc = LHS.getOpcode();
    6858       38858 :   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
    6859       19429 :       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    6860             :     std::swap(RHS, LHS);
    6861             : 
    6862             :   Opc = RHS.getOpcode();
    6863       19429 :   switch (Opc) {
    6864             :   default: break;
    6865          64 :   case ISD::ZERO_EXTEND:
    6866             :   case ISD::SIGN_EXTEND:
    6867             :   case ISD::ANY_EXTEND: {
    6868          64 :     auto Cond = RHS.getOperand(0);
    6869             :     if (!isBoolSGPR(Cond))
    6870             :       break;
    6871           9 :     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    6872          18 :     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    6873           9 :     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    6874           9 :     return DAG.getNode(Opc, SL, VTList, Args);
    6875             :   }
    6876           0 :   case ISD::ADDCARRY: {
    6877             :     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    6878             :     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    6879           0 :     if (!C || C->getZExtValue() != 0) break;
    6880           0 :     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    6881           0 :     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
    6882             :   }
    6883             :   }
    6884       19420 :   return SDValue();
    6885             : }
    6886             : 
    6887        2442 : SDValue SITargetLowering::performSubCombine(SDNode *N,
    6888             :                                             DAGCombinerInfo &DCI) const {
    6889        2442 :   SelectionDAG &DAG = DCI.DAG;
    6890             :   EVT VT = N->getValueType(0);
    6891             : 
    6892             :   if (VT != MVT::i32)
    6893         615 :     return SDValue();
    6894             : 
    6895             :   SDLoc SL(N);
    6896        1827 :   SDValue LHS = N->getOperand(0);
    6897        1827 :   SDValue RHS = N->getOperand(1);
    6898             : 
    6899             :   unsigned Opc = LHS.getOpcode();
    6900        1827 :   if (Opc != ISD::SUBCARRY)
    6901             :     std::swap(RHS, LHS);
    6902             : 
    6903        1827 :   if (LHS.getOpcode() == ISD::SUBCARRY) {
    6904             :     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    6905             :     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    6906           1 :     if (!C || C->getZExtValue() != 0)
    6907           0 :       return SDValue();
    6908           1 :     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    6909           2 :     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
    6910             :   }
    6911        1826 :   return SDValue();
    6912             : }
    6913             : 
    6914         650 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
    6915             :   DAGCombinerInfo &DCI) const {
    6916             : 
    6917             :   if (N->getValueType(0) != MVT::i32)
    6918           0 :     return SDValue();
    6919             : 
    6920             :   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    6921         232 :   if (!C || C->getZExtValue() != 0)
    6922         418 :     return SDValue();
    6923             : 
    6924         232 :   SelectionDAG &DAG = DCI.DAG;
    6925         232 :   SDValue LHS = N->getOperand(0);
    6926             : 
    6927             :   // addcarry (add x, y), 0, cc => addcarry x, y, cc
    6928             :   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
    6929             :   unsigned LHSOpc = LHS.getOpcode();
    6930             :   unsigned Opc = N->getOpcode();
    6931         464 :   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
    6932         232 :       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    6933           1 :     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    6934           2 :     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
    6935             :   }
    6936         231 :   return SDValue();
    6937             : }
    6938             : 
    6939        6594 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
    6940             :                                              DAGCombinerInfo &DCI) const {
    6941        6594 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6942        4613 :     return SDValue();
    6943             : 
    6944        1981 :   SelectionDAG &DAG = DCI.DAG;
    6945        1981 :   EVT VT = N->getValueType(0);
    6946             : 
    6947             :   SDLoc SL(N);
    6948        1981 :   SDValue LHS = N->getOperand(0);
    6949        1981 :   SDValue RHS = N->getOperand(1);
    6950             : 
    6951             :   // These should really be instruction patterns, but writing patterns with
     6952             :   // source modifiers is a pain.
    6953             : 
    6954             :   // fadd (fadd (a, a), b) -> mad 2.0, a, b
    6955        1981 :   if (LHS.getOpcode() == ISD::FADD) {
    6956         285 :     SDValue A = LHS.getOperand(0);
    6957             :     if (A == LHS.getOperand(1)) {
    6958          96 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    6959          96 :       if (FusedOp != 0) {
    6960          64 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    6961          64 :         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    6962             :       }
    6963             :     }
    6964             :   }
    6965             : 
    6966             :   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    6967        1917 :   if (RHS.getOpcode() == ISD::FADD) {
    6968          88 :     SDValue A = RHS.getOperand(0);
    6969             :     if (A == RHS.getOperand(1)) {
    6970          32 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    6971          32 :       if (FusedOp != 0) {
    6972          22 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    6973          22 :         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
    6974             :       }
    6975             :     }
    6976             :   }
    6977             : 
    6978        1895 :   return SDValue();
    6979             : }
    6980             : 
    6981        1833 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
    6982             :                                              DAGCombinerInfo &DCI) const {
    6983        1833 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6984        1266 :     return SDValue();
    6985             : 
    6986         567 :   SelectionDAG &DAG = DCI.DAG;
    6987             :   SDLoc SL(N);
    6988         567 :   EVT VT = N->getValueType(0);
    6989             :   assert(!VT.isVector());
    6990             : 
    6991             :   // Try to get the fneg to fold into the source modifier. This undoes generic
    6992             :   // DAG combines and folds them into the mad.
    6993             :   //
    6994             :   // Only do this if we are not trying to support denormals. v_mad_f32 does
    6995             :   // not support denormals ever.
    6996         567 :   SDValue LHS = N->getOperand(0);
    6997         567 :   SDValue RHS = N->getOperand(1);
    6998         567 :   if (LHS.getOpcode() == ISD::FADD) {
    6999             :     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    7000          48 :     SDValue A = LHS.getOperand(0);
    7001             :     if (A == LHS.getOperand(1)) {
    7002          26 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    7003          26 :       if (FusedOp != 0){
    7004          19 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7005          19 :         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    7006             : 
    7007          19 :         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    7008             :       }
    7009             :     }
    7010             :   }
    7011             : 
    7012         548 :   if (RHS.getOpcode() == ISD::FADD) {
    7013             :     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
    7014             : 
    7015          53 :     SDValue A = RHS.getOperand(0);
    7016             :     if (A == RHS.getOperand(1)) {
    7017          44 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    7018          44 :       if (FusedOp != 0){
    7019          35 :         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
    7020          35 :         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
    7021             :       }
    7022             :     }
    7023             :   }
    7024             : 
    7025         513 :   return SDValue();
    7026             : }
    7027             : 
    7028        9119 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
    7029             :                                               DAGCombinerInfo &DCI) const {
    7030        9119 :   SelectionDAG &DAG = DCI.DAG;
    7031             :   SDLoc SL(N);
    7032             : 
    7033        9119 :   SDValue LHS = N->getOperand(0);
    7034        9119 :   SDValue RHS = N->getOperand(1);
    7035             :   EVT VT = LHS.getValueType();
    7036             :   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
    7037             : 
    7038             :   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    7039             :   if (!CRHS) {
    7040             :     CRHS = dyn_cast<ConstantSDNode>(LHS);
    7041             :     if (CRHS) {
    7042             :       std::swap(LHS, RHS);
    7043           0 :       CC = getSetCCSwappedOperands(CC);
    7044             :     }
    7045             :   }
    7046             : 
    7047       13233 :   if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
    7048             :       isBoolSGPR(LHS.getOperand(0))) {
     7049             :     // (setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
     7050             :     // (setcc (sext from i1 cc), -1, eq|sle|uge) => cc
     7051             :     // (setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
     7052             :     // (setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
    7053           3 :     if ((CRHS->isAllOnesValue() &&
    7054           3 :          (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
    7055           0 :         (CRHS->isNullValue() &&
    7056           0 :          (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
    7057             :       return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    7058           6 :                          DAG.getConstant(-1, SL, MVT::i1));
    7059           0 :     if ((CRHS->isAllOnesValue() &&
    7060           0 :          (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
    7061           0 :         (CRHS->isNullValue() &&
    7062           0 :          (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
    7063           0 :       return LHS.getOperand(0);
    7064             :   }
    7065             : 
    7066        7396 :   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
    7067             :                                            VT != MVT::f16))
    7068        2877 :     return SDValue();
    7069             : 
    7070             :   // Match isinf pattern
    7071             :   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
    7072        6390 :   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    7073             :     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    7074             :     if (!CRHS)
    7075           0 :       return SDValue();
    7076             : 
    7077             :     const APFloat &APF = CRHS->getValueAPF();
    7078           4 :     if (APF.isInfinity() && !APF.isNegative()) {
    7079             :       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
    7080             :       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
    7081           4 :                          DAG.getConstant(Mask, SL, MVT::i32));
    7082             :     }
    7083             :   }
    7084             : 
    7085        6237 :   return SDValue();
    7086             : }
    7087             : 
    7088         360 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
    7089             :                                                      DAGCombinerInfo &DCI) const {
    7090         360 :   SelectionDAG &DAG = DCI.DAG;
    7091             :   SDLoc SL(N);
    7092         360 :   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
    7093             : 
    7094         360 :   SDValue Src = N->getOperand(0);
    7095         360 :   SDValue Srl = N->getOperand(0);
    7096         360 :   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    7097          52 :     Srl = Srl.getOperand(0);
    7098             : 
    7099             :   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    7100         360 :   if (Srl.getOpcode() == ISD::SRL) {
    7101             :     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    7102             :     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    7103             :     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
    7104             : 
    7105             :     if (const ConstantSDNode *C =
    7106             :         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
    7107         118 :       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
    7108          59 :                                EVT(MVT::i32));
    7109             : 
    7110          59 :       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
    7111          59 :       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
    7112          59 :         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
    7113          59 :                            MVT::f32, Srl);
    7114             :       }
    7115             :     }
    7116             :   }
    7117             : 
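                      :   // Only the byte selected by this cvt_f32_ubyteN is demanded from the
                      :   // source; try to simplify the remaining bits away.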
    7118         301 :   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
    7119             : 
    7120         301 :   KnownBits Known;
    7121         301 :   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    7122         301 :                                         !DCI.isBeforeLegalizeOps());
    7123             :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    7124         602 :   if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
    7125         301 :       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    7126          95 :     DCI.CommitTargetLoweringOpt(TLO);
    7127             :   }
    7128             : 
    7129         301 :   return SDValue();
    7130             : }
    7131             : 
    7132     1175306 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    7133             :                                             DAGCombinerInfo &DCI) const {
    7134     1175306 :   switch (N->getOpcode()) {
    7135      255511 :   default:
    7136      255511 :     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7137      130460 :   case ISD::ADD:
    7138      130460 :     return performAddCombine(N, DCI);
    7139        2442 :   case ISD::SUB:
    7140        2442 :     return performSubCombine(N, DCI);
    7141         650 :   case ISD::ADDCARRY:
    7142             :   case ISD::SUBCARRY:
    7143         650 :     return performAddCarrySubCarryCombine(N, DCI);
    7144        6594 :   case ISD::FADD:
    7145        6594 :     return performFAddCombine(N, DCI);
    7146        1833 :   case ISD::FSUB:
    7147        1833 :     return performFSubCombine(N, DCI);
    7148        9119 :   case ISD::SETCC:
    7149        9119 :     return performSetCCCombine(N, DCI);
    7150        8395 :   case ISD::FMAXNUM:
    7151             :   case ISD::FMINNUM:
    7152             :   case ISD::SMAX:
    7153             :   case ISD::SMIN:
    7154             :   case ISD::UMAX:
    7155             :   case ISD::UMIN:
    7156             :   case AMDGPUISD::FMIN_LEGACY:
    7157             :   case AMDGPUISD::FMAX_LEGACY: {
    7158       11330 :     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
    7159        2935 :         getTargetMachine().getOptLevel() > CodeGenOpt::None)
    7160        2935 :       return performMinMaxCombine(N, DCI);
    7161             :     break;
    7162             :   }
    7163      492156 :   case ISD::LOAD:
    7164             :   case ISD::STORE:
    7165             :   case ISD::ATOMIC_LOAD:
    7166             :   case ISD::ATOMIC_STORE:
    7167             :   case ISD::ATOMIC_CMP_SWAP:
    7168             :   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    7169             :   case ISD::ATOMIC_SWAP:
    7170             :   case ISD::ATOMIC_LOAD_ADD:
    7171             :   case ISD::ATOMIC_LOAD_SUB:
    7172             :   case ISD::ATOMIC_LOAD_AND:
    7173             :   case ISD::ATOMIC_LOAD_OR:
    7174             :   case ISD::ATOMIC_LOAD_XOR:
    7175             :   case ISD::ATOMIC_LOAD_NAND:
    7176             :   case ISD::ATOMIC_LOAD_MIN:
    7177             :   case ISD::ATOMIC_LOAD_MAX:
    7178             :   case ISD::ATOMIC_LOAD_UMIN:
    7179             :   case ISD::ATOMIC_LOAD_UMAX:
    7180             :   case AMDGPUISD::ATOMIC_INC:
    7181             :   case AMDGPUISD::ATOMIC_DEC:
    7182             :   case AMDGPUISD::ATOMIC_LOAD_FADD:
    7183             :   case AMDGPUISD::ATOMIC_LOAD_FMIN:
    7184             :   case AMDGPUISD::ATOMIC_LOAD_FMAX:  // TODO: Target mem intrinsics.
    7185      492156 :     if (DCI.isBeforeLegalize())
    7186             :       break;
    7187      339815 :     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
    7188       28952 :   case ISD::AND:
    7189       28952 :     return performAndCombine(N, DCI);
    7190       15704 :   case ISD::OR:
    7191       15704 :     return performOrCombine(N, DCI);
    7192        1416 :   case ISD::XOR:
    7193        1416 :     return performXorCombine(N, DCI);
    7194       15989 :   case ISD::ZERO_EXTEND:
    7195       15989 :     return performZeroExtendCombine(N, DCI);
    7196          82 :   case AMDGPUISD::FP_CLASS:
    7197          82 :     return performClassCombine(N, DCI);
    7198         401 :   case ISD::FCANONICALIZE:
    7199         401 :     return performFCanonicalizeCombine(N, DCI);
    7200         595 :   case AMDGPUISD::FRACT:
    7201             :   case AMDGPUISD::RCP:
    7202             :   case AMDGPUISD::RSQ:
    7203             :   case AMDGPUISD::RCP_LEGACY:
    7204             :   case AMDGPUISD::RSQ_LEGACY:
    7205             :   case AMDGPUISD::RSQ_CLAMP:
    7206             :   case AMDGPUISD::LDEXP: {
    7207         595 :     SDValue Src = N->getOperand(0);
    7208         595 :     if (Src.isUndef())
    7209          11 :       return Src;
    7210             :     break;
    7211             :   }
    7212        1112 :   case ISD::SINT_TO_FP:
    7213             :   case ISD::UINT_TO_FP:
    7214        1112 :     return performUCharToFloatCombine(N, DCI);
    7215         360 :   case AMDGPUISD::CVT_F32_UBYTE0:
    7216             :   case AMDGPUISD::CVT_F32_UBYTE1:
    7217             :   case AMDGPUISD::CVT_F32_UBYTE2:
    7218             :   case AMDGPUISD::CVT_F32_UBYTE3:
    7219         360 :     return performCvtF32UByteNCombine(N, DCI);
    7220         107 :   case AMDGPUISD::FMED3:
    7221         107 :     return performFMed3Combine(N, DCI);
    7222         133 :   case AMDGPUISD::CVT_PKRTZ_F16_F32:
    7223         133 :     return performCvtPkRTZCombine(N, DCI);
    7224         118 :   case ISD::SCALAR_TO_VECTOR: {
    7225         118 :     SelectionDAG &DAG = DCI.DAG;
    7226         118 :     EVT VT = N->getValueType(0);
    7227             : 
    7228             :     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    7229             :     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
    7230             :       SDLoc SL(N);
    7231          54 :       SDValue Src = N->getOperand(0);
    7232             :       EVT EltVT = Src.getValueType();
    7233             :       if (EltVT == MVT::f16)
    7234          20 :         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
    7235             : 
    7236          54 :       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
    7237          54 :       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    7238             :     }
    7239             : 
    7240          64 :     break;
    7241             :   }
    7242      106198 :   case ISD::EXTRACT_VECTOR_ELT:
    7243      106198 :     return performExtractVectorEltCombine(N, DCI);
    7244       96979 :   case ISD::BUILD_VECTOR:
    7245       96979 :     return performBuildVectorCombine(N, DCI);
    7246             :   }
    7247      158449 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7248             : }
    7249             : 
    7250             : /// \brief Helper function for adjustWritemask
    7251             : static unsigned SubIdx2Lane(unsigned Idx) {
    7252             :   switch (Idx) {
    7253             :   default: return 0;
    7254             :   case AMDGPU::sub0: return 0;
    7255             :   case AMDGPU::sub1: return 1;
    7256             :   case AMDGPU::sub2: return 2;
    7257             :   case AMDGPU::sub3: return 3;
    7258             :   }
    7259             : }
    7260             : 
    7261             : /// \brief Adjust the writemask of MIMG instructions
    7262         444 : SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    7263             :                                           SelectionDAG &DAG) const {
    7264         444 :   SDNode *Users[4] = { nullptr };
    7265             :   unsigned Lane = 0;
    7266         888 :   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
    7267         444 :   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
    7268             :   unsigned NewDmask = 0;
    7269             :   bool HasChain = Node->getNumValues() > 1;
    7270             : 
    7271         444 :   if (OldDmask == 0) {
    7272             :     // These are folded out, but on the chance it happens don't assert.
    7273             :     return Node;
    7274             :   }
    7275             : 
    7276             :   // Try to figure out the used register components
    7277             :   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
    7278        1152 :        I != E; ++I) {
    7279             : 
    7280             :     // Don't look at users of the chain.
    7281         946 :     if (I.getUse().getResNo() != 0)
    7282         151 :       continue;
    7283             : 
    7284             :     // Abort if we can't understand the usage
    7285         795 :     if (!I->isMachineOpcode() ||
    7286             :         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    7287             :       return Node;
    7288             : 
    7289             :     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    7290             :     // Note that subregs are packed, i.e. Lane==0 is the first bit set
    7291             :     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    7292             :     // set, etc.
    7293         559 :     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    7294             : 
    7295             :     // Set which texture component corresponds to the lane.
    7296             :     unsigned Comp;
    7297        2973 :     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
    7298        1207 :       Comp = countTrailingZeros(Dmask);
    7299        1207 :       Dmask &= ~(1 << Comp);
    7300             :     }
    7301             : 
    7302             :     // Abort if we have more than one user per component
    7303         559 :     if (Users[Lane])
    7304             :       return Node;
    7305             : 
    7306         557 :     Users[Lane] = *I;
    7307         557 :     NewDmask |= 1 << Comp;
    7308             :   }
    7309             : 
    7310             :   // Abort if there's no change
    7311         206 :   if (NewDmask == OldDmask)
    7312             :     return Node;
    7313             : 
    7314             :   unsigned BitsSet = countPopulation(NewDmask);
    7315             : 
    7316          77 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7317         154 :   int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
    7318          77 :                                           Node->getMachineOpcode(), BitsSet);
    7319             :   assert(NewOpcode != -1 &&
    7320             :          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
    7321             :          "failed to find equivalent MIMG op");
    7322             : 
    7323             :   // Adjust the writemask in the node
    7324             :   SmallVector<SDValue, 12> Ops;
    7325          77 :   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
    7326         308 :   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
    7327         231 :   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
    7328             : 
    7329         154 :   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
    7330             : 
    7331             :   MVT ResultVT = BitsSet == 1 ?
    7332          77 :     SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
    7333             :   SDVTList NewVTList = HasChain ?
    7334         154 :     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
    7335             : 
    7336             : 
    7337         231 :   MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
    7338          77 :                                               NewVTList, Ops);
    7339             : 
    7340          77 :   if (HasChain) {
    7341             :     // Update chain.
    7342          74 :     NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
    7343          74 :     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
    7344             :   }
    7345             : 
    7346          77 :   if (BitsSet == 1) {
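                      :   // If only one component remains, the result is scalar; replace the lone
                      :   // extract_subreg user with a plain copy of the new node.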
    7347             :     assert(Node->hasNUsesOfValue(1, 0));
    7348          43 :     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
    7349         129 :                                       SDLoc(Node), Users[Lane]->getValueType(0),
    7350          43 :                                       SDValue(NewNode, 0));
    7351          43 :     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    7352          43 :     return nullptr;
    7353             :   }
    7354             : 
    7355             :   // Update the users of the node with the new indices
    7356         306 :   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    7357         136 :     SDNode *User = Users[i];
    7358         136 :     if (!User)
    7359          48 :       continue;
    7360             : 
    7361         264 :     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    7362          88 :     DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    7363             : 
    7364          88 :     switch (Idx) {
    7365             :     default: break;
    7366          34 :     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    7367          34 :     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    7368          20 :     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    7369             :     }
    7370             :   }
    7371             : 
    7372          34 :   DAG.RemoveDeadNode(Node);
    7373          34 :   return nullptr;
    7374             : }
    7375             : 
    7376             : static bool isFrameIndexOp(SDValue Op) {
    7377      308987 :   if (Op.getOpcode() == ISD::AssertZext)
    7378          62 :     Op = Op.getOperand(0);
    7379             : 
    7380             :   return isa<FrameIndexSDNode>(Op);
    7381             : }
    7382             : 
    7383             : /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
    7384             : /// with frame index operands.
     7385             : /// LLVM assumes that inputs to these instructions are registers.
    7386       50232 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
    7387             :                                                         SelectionDAG &DAG) const {
    7388       50232 :   if (Node->getOpcode() == ISD::CopyToReg) {
    7389             :     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    7390       11032 :     SDValue SrcVal = Node->getOperand(2);
    7391             : 
    7392             :     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    7393             :     // to try understanding copies to physical registers.
    7394         163 :     if (SrcVal.getValueType() == MVT::i1 &&
    7395             :         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
    7396             :       SDLoc SL(Node);
    7397             :       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7398             :       SDValue VReg = DAG.getRegister(
    7399           8 :         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
    7400             : 
    7401             :       SDNode *Glued = Node->getGluedNode();
    7402             :       SDValue ToVReg
    7403             :         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
    7404          10 :                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
    7405             :       SDValue ToResultReg
    7406             :         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
    7407           8 :                            VReg, ToVReg.getValue(1));
    7408           8 :       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
    7409           8 :       DAG.RemoveDeadNode(Node);
    7410             :       return ToResultReg.getNode();
    7411             :     }
    7412             :   }
    7413             : 
    7414             :   SmallVector<SDValue, 8> Ops;
    7415      668198 :   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    7416      308965 :     if (!isFrameIndexOp(Node->getOperand(i))) {
    7417      308965 :       Ops.push_back(Node->getOperand(i));
    7418      308965 :       continue;
    7419             :     }
    7420             : 
    7421             :     SDLoc DL(Node);
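                      :     // Frame index operands are materialized with S_MOV_B32 so the node only
                      :     // sees register inputs.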
    7422          44 :     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
    7423             :                                      Node->getOperand(i).getValueType(),
    7424             :                                      Node->getOperand(i)), 0));
    7425             :   }
    7426             : 
    7427       50224 :   return DAG.UpdateNodeOperands(Node, Ops);
    7428             : }
    7429             : 
    7430             : /// \brief Fold the instructions after selecting them.
    7431             : /// Returns null if users were already updated.
    7432      340853 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
    7433             :                                           SelectionDAG &DAG) const {
    7434      340853 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7435      340853 :   unsigned Opcode = Node->getMachineOpcode();
    7436             : 
    7437      342071 :   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
    7438      341330 :       !TII->isGather4(Opcode) && !TII->isD16(Opcode)) {
    7439         444 :     return adjustWritemask(Node, DAG);
    7440             :   }
    7441             : 
    7442      680818 :   if (Opcode == AMDGPU::INSERT_SUBREG ||
    7443      340409 :       Opcode == AMDGPU::REG_SEQUENCE) {
    7444       39200 :     legalizeTargetIndependentNode(Node, DAG);
    7445       39200 :     return Node;
    7446             :   }
    7447             : 
    7448      301209 :   switch (Opcode) {
    7449         259 :   case AMDGPU::V_DIV_SCALE_F32:
    7450             :   case AMDGPU::V_DIV_SCALE_F64: {
    7451             :     // Satisfy the operand register constraint when one of the inputs is
    7452             :     // undefined. Ordinarily each undef value will have its own implicit_def of
    7453             :     // a vreg, so force these to use a single register.
    7454         259 :     SDValue Src0 = Node->getOperand(0);
    7455         259 :     SDValue Src1 = Node->getOperand(1);
    7456         259 :     SDValue Src2 = Node->getOperand(2);
    7457             : 
    7458         256 :     if ((Src0.isMachineOpcode() &&
    7459         259 :          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
    7460             :         (Src0 == Src1 || Src0 == Src2))
    7461             :       break;
    7462             : 
    7463             :     MVT VT = Src0.getValueType().getSimpleVT();
    7464           6 :     const TargetRegisterClass *RC = getRegClassFor(VT);
    7465             : 
    7466             :     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7467           6 :     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
    7468             : 
    7469          12 :     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
    7470          18 :                                       UndefReg, Src0, SDValue());
    7471             : 
    7472             :     // src0 must be the same register as src1 or src2, even if the value is
    7473             :     // undefined, so make sure we don't violate this constraint.
    7474           6 :     if (Src0.isMachineOpcode() &&
    7475             :         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
    7476           3 :       if (Src1.isMachineOpcode() &&
    7477             :           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7478             :         Src0 = Src1;
    7479           3 :       else if (Src2.isMachineOpcode() &&
    7480             :                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7481             :         Src0 = Src2;
    7482             :       else {
    7483             :         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
    7484           1 :         Src0 = UndefReg;
    7485             :         Src1 = UndefReg;
    7486             :       }
    7487             :     } else
    7488             :       break;
    7489             : 
    7490           6 :     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    7491           6 :     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
    7492           0 :       Ops.push_back(Node->getOperand(I));
    7493             : 
    7494           3 :     Ops.push_back(ImpDef.getValue(1));
    7495           9 :     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    7496             :   }
    7497             :   default:
    7498             :     break;
    7499             :   }
    7500             : 
    7501      301206 :   return Node;
    7502             : }
    7503             : 
    7504             : /// \brief Assign the register class depending on the number of
    7505             : /// bits set in the writemask
    7506       31336 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
    7507             :                                                      SDNode *Node) const {
    7508       31336 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7509             : 
    7510             :   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    7511             : 
    7512       62672 :   if (TII->isVOP3(MI.getOpcode())) {
    7513             :     // Make sure constant bus requirements are respected.
    7514       29160 :     TII->legalizeOperandsVOP3(MRI, MI);
    7515       29160 :     return;
    7516             :   }
    7517             : 
    7518             :   // Replace unused atomics with the no return version.
    7519        2176 :   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
    7520        2176 :   if (NoRetAtomicOp != -1) {
    7521        1840 :     if (!Node->hasAnyUseOfValue(0)) {
    7522         946 :       MI.setDesc(TII->get(NoRetAtomicOp));
    7523         946 :       MI.RemoveOperand(0);
    7524         946 :       return;
    7525             :     }
    7526             : 
    7527             :     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    7528             :     // instruction, because the return type of these instructions is a vec2 of
    7529             :     // the memory type, so it can be tied to the input operand.
    7530             :     // This means these instructions always have a use, so we need to add a
    7531             :     // special case to check if the atomic has only one extract_subreg use,
    7532             :     // which itself has no uses.
    7533        1786 :     if ((Node->hasNUsesOfValue(1, 0) &&
    7534         868 :          Node->use_begin()->isMachineOpcode() &&
    7535         906 :          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
    7536          12 :          !Node->use_begin()->hasAnyUseOfValue(0))) {
    7537             :       unsigned Def = MI.getOperand(0).getReg();
    7538             : 
    7539             :       // Change this into a noret atomic.
    7540           0 :       MI.setDesc(TII->get(NoRetAtomicOp));
    7541           0 :       MI.RemoveOperand(0);
    7542             : 
    7543             :       // If we only remove the def operand from the atomic instruction, the
    7544             :       // extract_subreg will be left with a use of a vreg without a def.
    7545             :       // So we need to insert an implicit_def to avoid machine verifier
    7546             :       // errors.
    7547           0 :       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
    7548             :               TII->get(AMDGPU::IMPLICIT_DEF), Def);
    7549             :     }
    7550             :     return;
    7551             :   }
    7552             : }
    7553             : 
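                      : // Materialize a 32-bit immediate in an SGPR with S_MOV_B32.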
    7554       44768 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
    7555             :                               uint64_t Val) {
    7556       44768 :   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
    7557       44768 :   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
    7558             : }
    7559             : 
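                      : // Build a 128-bit buffer resource descriptor for MUBUF ADDR64 accesses: the
                      : // 64-bit pointer occupies the low two dwords and constants derived from the
                      : // default resource data format fill the high two dwords.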
    7560        4097 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
    7561             :                                                 const SDLoc &DL,
    7562             :                                                 SDValue Ptr) const {
    7563        4097 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7564             : 
    7565             :   // Build the half of the subregister with the constants before building the
    7566             :   // full 128-bit register. If we are building multiple resource descriptors,
    7567             :   // this will allow CSEing of the 2-component register.
    7568             :   const SDValue Ops0[] = {
    7569             :     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    7570             :     buildSMovImm32(DAG, DL, 0),
    7571             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    7572        4097 :     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    7573             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    7574       16388 :   };
    7575             : 
    7576        4097 :   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
    7577             :                                                 MVT::v2i32, Ops0), 0);
    7578             : 
    7579             :   // Combine the constants and the pointer.
    7580             :   const SDValue Ops1[] = {
    7581             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    7582             :     Ptr,
    7583             :     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    7584             :     SubRegHi,
    7585             :     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
    7586       12291 :   };
    7587             : 
    7588        4097 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
    7589             : }
    7590             : 
    7591             : /// \brief Return a resource descriptor with the 'Add TID' bit enabled
    7592             : ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
    7593             : ///        of the resource descriptor) to create an offset, which is added to
    7594             : ///        the resource pointer.
    7595       18287 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
    7596             :                                            SDValue Ptr, uint32_t RsrcDword1,
    7597             :                                            uint64_t RsrcDword2And3) const {
    7598       18287 :   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
    7599       18287 :   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
    7600       18287 :   if (RsrcDword1) {
    7601           0 :     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
    7602             :                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
    7603             :                     0);
    7604             :   }
    7605             : 
    7606             :   SDValue DataLo = buildSMovImm32(DAG, DL,
    7607       18287 :                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
    7608       18287 :   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
    7609             : 
    7610             :   const SDValue Ops[] = {
    7611             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    7612             :     PtrLo,
    7613             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    7614             :     PtrHi,
    7615             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    7616             :     DataLo,
    7617             :     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    7618             :     DataHi,
    7619             :     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
    7620       91435 :   };
    7621             : 
    7622       18287 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
    7623             : }
    7624             : 
    7625             : //===----------------------------------------------------------------------===//
    7626             : //                         SI Inline Assembly Support
    7627             : //===----------------------------------------------------------------------===//
    7628             : 
    7629             : std::pair<unsigned, const TargetRegisterClass *>
    7630        1917 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    7631             :                                                StringRef Constraint,
    7632             :                                                MVT VT) const {
    7633             :   if (!isTypeLegal(VT))
    7634         997 :     return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    7635             : 
    7636         920 :   if (Constraint.size() == 1) {
    7637         533 :     switch (Constraint[0]) {
    7638         278 :     case 's':
    7639             :     case 'r':
    7640         278 :       switch (VT.getSizeInBits()) {
    7641           0 :       default:
    7642           0 :         return std::make_pair(0U, nullptr);
    7643         145 :       case 32:
    7644             :       case 16:
    7645         145 :         return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
    7646          63 :       case 64:
    7647          63 :         return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
    7648          10 :       case 128:
    7649          10 :         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
    7650          44 :       case 256:
    7651          44 :         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
    7652          16 :       case 512:
    7653          16 :         return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
    7654             :       }
    7655             : 
    7656         255 :     case 'v':
    7657         255 :       switch (VT.getSizeInBits()) {
    7658           0 :       default:
    7659           0 :         return std::make_pair(0U, nullptr);
    7660         198 :       case 32:
    7661             :       case 16:
    7662         198 :         return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
    7663          36 :       case 64:
    7664          36 :         return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
    7665           0 :       case 96:
    7666           0 :         return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
    7667          21 :       case 128:
    7668          21 :         return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
    7669           0 :       case 256:
    7670           0 :         return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
    7671           0 :       case 512:
    7672           0 :         return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
    7673             :       }
    7674             :     }
    7675             :   }
    7676             : 
    7677         387 :   if (Constraint.size() > 1) {
    7678             :     const TargetRegisterClass *RC = nullptr;
    7679         387 :     if (Constraint[1] == 'v') {
    7680             :       RC = &AMDGPU::VGPR_32RegClass;
    7681         281 :     } else if (Constraint[1] == 's') {
    7682             :       RC = &AMDGPU::SGPR_32RegClass;
    7683             :     }
    7684             : 
    7685             :     if (RC) {
    7686             :       uint32_t Idx;
    7687         704 :       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
    7688         352 :       if (!Failed && Idx < RC->getNumRegs())
    7689             :         return std::make_pair(RC->getRegister(Idx), RC);
    7690             :     }
    7691             :   }
    7692         387 :   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    7693             : }
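This hook backs the 's' (scalar) and 'v' (vector) inline assembly constraints. Purely as an illustration of how they might be exercised, a C-style snippet compiled for an AMDGPU target could look like the sketch below; read_first_lane is a hypothetical function, and only the constraint letters and operand widths matter for the mapping performed above.

    // '=s' requests an SGPR for the output and 'v' a VGPR for the input; the
    // 32-bit operand widths select SReg_32_XM0 / VGPR_32 in the switches above.
    static int read_first_lane(int v) {
      int s;
      __asm__("v_readfirstlane_b32 %0, %1" : "=s"(s) : "v"(v));
      return s;
    }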
    7694             : 
    7695             : SITargetLowering::ConstraintType
    7696        6505 : SITargetLowering::getConstraintType(StringRef Constraint) const {
    7697        6505 :   if (Constraint.size() == 1) {
    7698        2385 :     switch (Constraint[0]) {
    7699             :     default: break;
    7700             :     case 's':
    7701             :     case 'v':
    7702             :       return C_RegisterClass;
    7703             :     }
    7704             :   }
    7705        4304 :   return TargetLowering::getConstraintType(Constraint);
    7706             : }
    7707             : 
    7708             : // Figure out which registers should be reserved for stack access. Only after
    7709             : // the function is legalized do we know all of the non-spill stack objects or if
    7710             : // calls are present.
    7711       16641 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
    7712             :   MachineRegisterInfo &MRI = MF.getRegInfo();
    7713       16641 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    7714             :   const MachineFrameInfo &MFI = MF.getFrameInfo();
    7715             :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    7716             :   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    7717             : 
    7718       16641 :   if (Info->isEntryFunction()) {
    7719             :     // Callable functions have fixed registers used for stack access.
    7720       15474 :     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
    7721             :   }
    7722             : 
    7723             :   // We have to assume the SP is needed in case there are calls in the function
    7724             :   // during lowering, since calls are only detected after the function is
    7725             :   // lowered. We're about to reserve registers, so don't reserve the SP unless
    7726             :   // it will actually be used.
    7727       15474 :   bool NeedSP = !Info->isEntryFunction() ||
    7728       32113 :     MFI.hasVarSizedObjects() ||
    7729             :     MFI.hasCalls();
    7730             : 
    7731             :   if (NeedSP) {
    7732        1500 :     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    7733             :     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
    7734             : 
    7735             :     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    7736             :     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
    7737             :                                Info->getStackPtrOffsetReg()));
    7738        1500 :     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
    7739             :   }
    7740             : 
    7741       16641 :   MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
    7742       16641 :   MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
    7743       16641 :   MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
    7744             :                      Info->getScratchWaveOffsetReg());
    7745             : 
    7746       16641 :   TargetLoweringBase::finalizeLowering(MF);
    7747       16641 : }
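Because the NeedSP expression above is split across several partially covered lines, it may help to restate the condition on its own; this is a sketch for exposition only, not code from this file.

    // A dedicated stack pointer register is reserved unless this is an entry
    // function with no variable-sized stack objects and no calls.
    static bool needsStackPointer(bool IsEntryFunction, bool HasVarSizedObjects,
                                  bool HasCalls) {
      return !IsEntryFunction || HasVarSizedObjects || HasCalls;
    }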
    7748             : 
    7749      452326 : void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
    7750             :                                                      KnownBits &Known,
    7751             :                                                      const APInt &DemandedElts,
    7752             :                                                      const SelectionDAG &DAG,
    7753             :                                                      unsigned Depth) const {
    7754      452326 :   TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
    7755             :                                                 DAG, Depth);
    7756             : 
    7757      452326 :   if (getSubtarget()->enableHugePrivateBuffer())
    7758             :     return;
    7759             : 
    7760             :   // Technically it may be possible to have a dispatch with a single workitem
    7761             :   // that uses the full private memory size, but that's not really useful. We
    7762             :   // can't use vaddr in MUBUF instructions if we don't know the address
    7763             :   // calculation won't overflow, so assume the sign bit is never set.
    7764      452318 :   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
    7765      291951 : }

Generated by: LCOV version 1.13