LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIISelLowering.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-06-17 00:07:59

                 Hit    Total   Coverage
    Lines:       3011   3185    94.5 %
    Functions:   156    157     99.4 %

          Line data    Source code
       1             : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Custom DAG lowering for SI
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #ifdef _MSC_VER
      16             : // Provide M_PI.
      17             : #define _USE_MATH_DEFINES
      18             : #endif
      19             : 
      20             : #include "SIISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUIntrinsicInfo.h"
      23             : #include "AMDGPUSubtarget.h"
      24             : #include "AMDGPUTargetMachine.h"
      25             : #include "SIDefines.h"
      26             : #include "SIInstrInfo.h"
      27             : #include "SIMachineFunctionInfo.h"
      28             : #include "SIRegisterInfo.h"
      29             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      30             : #include "Utils/AMDGPUBaseInfo.h"
      31             : #include "llvm/ADT/APFloat.h"
      32             : #include "llvm/ADT/APInt.h"
      33             : #include "llvm/ADT/ArrayRef.h"
      34             : #include "llvm/ADT/BitVector.h"
      35             : #include "llvm/ADT/SmallVector.h"
      36             : #include "llvm/ADT/Statistic.h"
      37             : #include "llvm/ADT/StringRef.h"
      38             : #include "llvm/ADT/StringSwitch.h"
      39             : #include "llvm/ADT/Twine.h"
      40             : #include "llvm/CodeGen/Analysis.h"
      41             : #include "llvm/CodeGen/CallingConvLower.h"
      42             : #include "llvm/CodeGen/DAGCombine.h"
      43             : #include "llvm/CodeGen/ISDOpcodes.h"
      44             : #include "llvm/CodeGen/MachineBasicBlock.h"
      45             : #include "llvm/CodeGen/MachineFrameInfo.h"
      46             : #include "llvm/CodeGen/MachineFunction.h"
      47             : #include "llvm/CodeGen/MachineInstr.h"
      48             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      49             : #include "llvm/CodeGen/MachineMemOperand.h"
      50             : #include "llvm/CodeGen/MachineModuleInfo.h"
      51             : #include "llvm/CodeGen/MachineOperand.h"
      52             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      53             : #include "llvm/CodeGen/SelectionDAG.h"
      54             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      55             : #include "llvm/CodeGen/TargetCallingConv.h"
      56             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      57             : #include "llvm/CodeGen/ValueTypes.h"
      58             : #include "llvm/IR/Constants.h"
      59             : #include "llvm/IR/DataLayout.h"
      60             : #include "llvm/IR/DebugLoc.h"
      61             : #include "llvm/IR/DerivedTypes.h"
      62             : #include "llvm/IR/DiagnosticInfo.h"
      63             : #include "llvm/IR/Function.h"
      64             : #include "llvm/IR/GlobalValue.h"
      65             : #include "llvm/IR/InstrTypes.h"
      66             : #include "llvm/IR/Instruction.h"
      67             : #include "llvm/IR/Instructions.h"
      68             : #include "llvm/IR/IntrinsicInst.h"
      69             : #include "llvm/IR/Type.h"
      70             : #include "llvm/Support/Casting.h"
      71             : #include "llvm/Support/CodeGen.h"
      72             : #include "llvm/Support/CommandLine.h"
      73             : #include "llvm/Support/Compiler.h"
      74             : #include "llvm/Support/ErrorHandling.h"
      75             : #include "llvm/Support/KnownBits.h"
      76             : #include "llvm/Support/MachineValueType.h"
      77             : #include "llvm/Support/MathExtras.h"
      78             : #include "llvm/Target/TargetOptions.h"
      79             : #include <cassert>
      80             : #include <cmath>
      81             : #include <cstdint>
      82             : #include <iterator>
      83             : #include <tuple>
      84             : #include <utility>
      85             : #include <vector>
      86             : 
      87             : using namespace llvm;
      88             : 
      89             : #define DEBUG_TYPE "si-lower"
      90             : 
      91             : STATISTIC(NumTailCalls, "Number of tail calls");
      92             : 
      93      101169 : static cl::opt<bool> EnableVGPRIndexMode(
      94             :   "amdgpu-vgpr-index-mode",
      95      101169 :   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
      96      303507 :   cl::init(false));
      97             : 
      98      101169 : static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
      99             :   "amdgpu-frame-index-zero-bits",
     100      101169 :   cl::desc("High bits of frame index assumed to be zero"),
     101      202338 :   cl::init(5),
     102      303507 :   cl::ReallyHidden);
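                      : 
                      : // Usage sketch (assuming the standard llc driver): both options are
                      : // registered cl::opts, so they can be set on the command line, e.g.
                      : //   llc -march=amdgcn -mcpu=gfx900 -amdgpu-vgpr-index-mode < input.ll
                      : //   llc -march=amdgcn -amdgpu-frame-index-zero-bits=8 < input.ll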
     103             : 
     104             : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
     105          45 :   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
     106         373 :   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
     107         418 :     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
     108             :       return AMDGPU::SGPR0 + Reg;
     109             :     }
     110             :   }
     111           0 :   llvm_unreachable("Cannot allocate sgpr");
     112             : }
     113             : 
     114        2241 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
     115        2241 :                                    const SISubtarget &STI)
     116        2241 :     : AMDGPUTargetLowering(TM, STI) {
     117             :   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
     118             :   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
     119             : 
     120             :   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
     121             :   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
     122             : 
     123             :   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
     124             :   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
     125             :   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
     126             : 
     127             :   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
     128             :   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
     129             : 
     130             :   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
     131             :   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
     132             : 
     133             :   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
     134             :   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
     135             : 
     136             :   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
     137             :   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
     138             : 
     139        2241 :   if (Subtarget->has16BitInsts()) {
     140             :     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     141             :     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
     142             : 
      143             :     // Unless there are also VOP3P operations, no operations are really legal.
     144             :     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     145             :     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
     146             :     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     147             :     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
     148             :   }
     149             : 
     150        2241 :   computeRegisterProperties(STI.getRegisterInfo());
     151             : 
     152             :   // We need to custom lower vector stores from local memory
     153             :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     154             :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
     155             :   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
     156             :   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
     157             :   setOperationAction(ISD::LOAD, MVT::i1, Custom);
     158             : 
     159             :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     160             :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
     161             :   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
     162             :   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
     163             :   setOperationAction(ISD::STORE, MVT::i1, Custom);
     164             : 
     165             :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     166             :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
     167             :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
     168             :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
     169             :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
     170             :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
     171             :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
     172             :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
     173             :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
     174             :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
     175             : 
     176             :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     177             :   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
     178             : 
     179             :   setOperationAction(ISD::SELECT, MVT::i1, Promote);
     180             :   setOperationAction(ISD::SELECT, MVT::i64, Custom);
     181             :   setOperationAction(ISD::SELECT, MVT::f64, Promote);
     182             :   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
     183             : 
     184             :   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     185             :   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
     186             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     187             :   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
     188             :   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
     189             : 
     190             :   setOperationAction(ISD::SETCC, MVT::i1, Promote);
     191             :   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
     192             :   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
     193             :   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
     194             : 
     195             :   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
     196             :   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
     197             : 
     198             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
     199             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
     200             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
     201             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
     202             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
     203             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
     204             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
     205             : 
     206             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     207             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
     208             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
     209             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
     210             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
     211             : 
     212             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
     213             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
     214             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
     215             : 
     216             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     217             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
     218             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
     219             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
     220             : 
     221             :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     222             :   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
     223             :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     224             :   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
     225             :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     226             :   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     227             : 
     228             :   setOperationAction(ISD::UADDO, MVT::i32, Legal);
     229             :   setOperationAction(ISD::USUBO, MVT::i32, Legal);
     230             : 
     231             :   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
     232             :   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
     233             : 
     234             : #if 0
     235             :   setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
     236             :   setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
     237             : #endif
     238             : 
     239             :   // We only support LOAD/STORE and vector manipulation ops for vectors
     240             :   // with > 4 elements.
     241       17928 :   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
     242       20169 :         MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
     243     9268776 :     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     244     4625424 :       switch (Op) {
     245             :       case ISD::LOAD:
     246             :       case ISD::STORE:
     247             :       case ISD::BUILD_VECTOR:
     248             :       case ISD::BITCAST:
     249             :       case ISD::EXTRACT_VECTOR_ELT:
     250             :       case ISD::INSERT_VECTOR_ELT:
     251             :       case ISD::INSERT_SUBVECTOR:
     252             :       case ISD::EXTRACT_SUBVECTOR:
     253             :       case ISD::SCALAR_TO_VECTOR:
     254             :         break;
     255       17928 :       case ISD::CONCAT_VECTORS:
     256             :         setOperationAction(Op, VT, Custom);
     257             :         break;
     258     4446144 :       default:
     259             :         setOperationAction(Op, VT, Expand);
     260             :         break;
     261             :       }
     262             :     }
     263             :   }
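                      : 
                      :   // Net effect of the loop above: for these wide vector types only the
                      :   // memory and element-manipulation opcodes listed in the switch keep their
                      :   // default handling; CONCAT_VECTORS becomes Custom and every other generic
                      :   // opcode is Expand, so e.g. an ISD::ADD on MVT::v8i32 will typically be
                      :   // unrolled into scalar i32 adds during legalization.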
     264             : 
     265             :   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
     266             : 
     267             :   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
     268             :   // is expanded to avoid having two separate loops in case the index is a VGPR.
     269             : 
     270             :   // Most operations are naturally 32-bit vector operations. We only support
     271             :   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
     272       11205 :   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
     273             :     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
     274             :     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
     275             : 
     276             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     277             :     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
     278             : 
     279             :     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
     280             :     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
     281             : 
     282             :     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
     283             :     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
     284             :   }
     285             : 
     286             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
     287             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
     288             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
     289             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
     290             : 
     291             :   setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
     292             :   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
     293             : 
     294             :   // Avoid stack access for these.
     295             :   // TODO: Generalize to more vector types.
     296             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
     297             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
     298             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
     299             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
     300             : 
     301             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     302             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     303             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
     304             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
     305             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
     306             : 
     307             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
     308             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
     309             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
     310             : 
     311             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
     312             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
     313             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
     314             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
     315             : 
     316             :   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
     317             :   // and output demarshalling
     318             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
     319             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
     320             : 
     321             :   // We can't return success/failure, only the old value,
      322             :   // so let LLVM add the comparison.
     323             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
     324             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
     325             : 
     326        2241 :   if (getSubtarget()->hasFlatAddressSpace()) {
     327             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
     328             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
     329             :   }
     330             : 
     331             :   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
     332             :   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
     333             : 
      334             :   // This is s_memtime on SI and s_memrealtime on VI.
     335             :   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
     336             :   setOperationAction(ISD::TRAP, MVT::Other, Custom);
     337             :   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
     338             : 
     339             :   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
     340             :   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     341             : 
     342        2241 :   if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
     343             :     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     344             :     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     345             :     setOperationAction(ISD::FRINT, MVT::f64, Legal);
     346             :   }
     347             : 
     348             :   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     349             : 
     350             :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     351             :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     352             :   setOperationAction(ISD::FDIV, MVT::f32, Custom);
     353             :   setOperationAction(ISD::FDIV, MVT::f64, Custom);
     354             : 
     355        2241 :   if (Subtarget->has16BitInsts()) {
     356             :     setOperationAction(ISD::Constant, MVT::i16, Legal);
     357             : 
     358             :     setOperationAction(ISD::SMIN, MVT::i16, Legal);
     359             :     setOperationAction(ISD::SMAX, MVT::i16, Legal);
     360             : 
     361             :     setOperationAction(ISD::UMIN, MVT::i16, Legal);
     362             :     setOperationAction(ISD::UMAX, MVT::i16, Legal);
     363             : 
     364             :     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
     365             :     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
     366             : 
     367             :     setOperationAction(ISD::ROTR, MVT::i16, Promote);
     368             :     setOperationAction(ISD::ROTL, MVT::i16, Promote);
     369             : 
     370             :     setOperationAction(ISD::SDIV, MVT::i16, Promote);
     371             :     setOperationAction(ISD::UDIV, MVT::i16, Promote);
     372             :     setOperationAction(ISD::SREM, MVT::i16, Promote);
     373             :     setOperationAction(ISD::UREM, MVT::i16, Promote);
     374             : 
     375             :     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
     376             :     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
     377             : 
     378             :     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
     379             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
     380             :     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
     381             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
     382             :     setOperationAction(ISD::CTPOP, MVT::i16, Promote);
     383             : 
     384             :     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
     385             : 
     386             :     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
     387             : 
     388             :     setOperationAction(ISD::LOAD, MVT::i16, Custom);
     389             : 
     390             :     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     391             : 
     392             :     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
     393             :     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
     394             :     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     395             :     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
     396             : 
     397             :     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
     398             :     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     399             :     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
     400             :     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     401             : 
     402             :     // F16 - Constant Actions.
     403             :     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
     404             : 
     405             :     // F16 - Load/Store Actions.
     406             :     setOperationAction(ISD::LOAD, MVT::f16, Promote);
     407             :     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
     408             :     setOperationAction(ISD::STORE, MVT::f16, Promote);
     409             :     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
     410             : 
     411             :     // F16 - VOP1 Actions.
     412             :     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     413             :     setOperationAction(ISD::FCOS, MVT::f16, Promote);
     414             :     setOperationAction(ISD::FSIN, MVT::f16, Promote);
     415             :     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
     416             :     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     417             :     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
     418             :     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
     419             :     setOperationAction(ISD::FROUND, MVT::f16, Custom);
     420             : 
     421             :     // F16 - VOP2 Actions.
     422             :     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     423             :     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
     424             :     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     425             :     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     426             :     setOperationAction(ISD::FDIV, MVT::f16, Custom);
     427             : 
     428             :     // F16 - VOP3 Actions.
     429             :     setOperationAction(ISD::FMA, MVT::f16, Legal);
     430        1098 :     if (!Subtarget->hasFP16Denormals())
     431             :       setOperationAction(ISD::FMAD, MVT::f16, Legal);
     432             : 
     433        9882 :     for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
     434     2270664 :       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     435     1133136 :         switch (Op) {
     436             :         case ISD::LOAD:
     437             :         case ISD::STORE:
     438             :         case ISD::BUILD_VECTOR:
     439             :         case ISD::BITCAST:
     440             :         case ISD::EXTRACT_VECTOR_ELT:
     441             :         case ISD::INSERT_VECTOR_ELT:
     442             :         case ISD::INSERT_SUBVECTOR:
     443             :         case ISD::EXTRACT_SUBVECTOR:
     444             :         case ISD::SCALAR_TO_VECTOR:
     445             :           break;
     446        4392 :         case ISD::CONCAT_VECTORS:
     447             :           setOperationAction(Op, VT, Custom);
     448             :           break;
     449     1089216 :         default:
     450             :           setOperationAction(Op, VT, Expand);
     451             :           break;
     452             :         }
     453             :       }
     454             :     }
     455             : 
     456             :     // XXX - Do these do anything? Vector constants turn into build_vector.
     457             :     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
     458             :     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
     459             : 
     460             :     setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
     461             :     setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
     462             : 
     463             :     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
     464             :     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
     465             :     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
     466             :     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
     467             : 
     468             :     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
     469             :     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
     470             :     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
     471             :     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
     472             : 
     473             :     setOperationAction(ISD::AND, MVT::v2i16, Promote);
     474             :     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
     475             :     setOperationAction(ISD::OR, MVT::v2i16, Promote);
     476             :     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     477             :     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     478             :     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
     479             : 
     480             :     setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
     481             :     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
     482             :     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
     483             :     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
     484             : 
     485             :     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
     486             :     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
     487             :     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
     488             :     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
     489             : 
     490             :     setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
     491             :     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
     492             :     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
     493             :     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
     494             : 
     495             :     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
     496             :     setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
     497             :     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
     498             : 
     499        1098 :     if (!Subtarget->hasVOP3PInsts()) {
     500             :       setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
     501             :       setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
     502             :     }
     503             : 
     504             :     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
     505             :     // This isn't really legal, but this avoids the legalizer unrolling it (and
     506             :     // allows matching fneg (fabs x) patterns)
     507             :     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
     508             :   }
     509             : 
     510        2241 :   if (Subtarget->hasVOP3PInsts()) {
     511             :     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     512             :     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     513             :     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
     514             :     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
     515             :     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
     516             :     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
     517             :     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
     518             :     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
     519             :     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
     520             :     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
     521             : 
     522             :     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     523             :     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     524             :     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
     525             :     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
     526             :     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
     527             :     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
     528             : 
     529             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     530             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     531             : 
     532             :     setOperationAction(ISD::SHL, MVT::v4i16, Custom);
     533             :     setOperationAction(ISD::SRA, MVT::v4i16, Custom);
     534             :     setOperationAction(ISD::SRL, MVT::v4i16, Custom);
     535             :     setOperationAction(ISD::ADD, MVT::v4i16, Custom);
     536             :     setOperationAction(ISD::SUB, MVT::v4i16, Custom);
     537             :     setOperationAction(ISD::MUL, MVT::v4i16, Custom);
     538             : 
     539             :     setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
     540             :     setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
     541             :     setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
     542             :     setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
     543             : 
     544             :     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     545             :     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
     546             :     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     547             :     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
     548             : 
     549             :     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
     550             :     setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
     551             :   }
     552             : 
     553             :   setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
     554             :   setOperationAction(ISD::FABS, MVT::v4f16, Custom);
     555             : 
     556        2241 :   if (Subtarget->has16BitInsts()) {
     557             :     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
     558             :     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
     559             :     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
     560             :     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
     561             :   } else {
     562             :     // Legalization hack.
     563             :     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     564             :     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
     565             : 
     566             :     setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
     567             :     setOperationAction(ISD::FABS, MVT::v2f16, Custom);
     568             :   }
     569             : 
     570       24651 :   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
     571             :     setOperationAction(ISD::SELECT, VT, Custom);
     572             :   }
     573             : 
     574             :   setTargetDAGCombine(ISD::ADD);
     575             :   setTargetDAGCombine(ISD::ADDCARRY);
     576             :   setTargetDAGCombine(ISD::SUB);
     577             :   setTargetDAGCombine(ISD::SUBCARRY);
     578             :   setTargetDAGCombine(ISD::FADD);
     579             :   setTargetDAGCombine(ISD::FSUB);
     580             :   setTargetDAGCombine(ISD::FMINNUM);
     581             :   setTargetDAGCombine(ISD::FMAXNUM);
     582             :   setTargetDAGCombine(ISD::SMIN);
     583             :   setTargetDAGCombine(ISD::SMAX);
     584             :   setTargetDAGCombine(ISD::UMIN);
     585             :   setTargetDAGCombine(ISD::UMAX);
     586             :   setTargetDAGCombine(ISD::SETCC);
     587             :   setTargetDAGCombine(ISD::AND);
     588             :   setTargetDAGCombine(ISD::OR);
     589             :   setTargetDAGCombine(ISD::XOR);
     590             :   setTargetDAGCombine(ISD::SINT_TO_FP);
     591             :   setTargetDAGCombine(ISD::UINT_TO_FP);
     592             :   setTargetDAGCombine(ISD::FCANONICALIZE);
     593             :   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
     594             :   setTargetDAGCombine(ISD::ZERO_EXTEND);
     595             :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     596             :   setTargetDAGCombine(ISD::BUILD_VECTOR);
     597             : 
     598             :   // All memory operations. Some folding on the pointer operand is done to help
      599             : // match the constant offsets in the addressing modes.
     600             :   setTargetDAGCombine(ISD::LOAD);
     601             :   setTargetDAGCombine(ISD::STORE);
     602             :   setTargetDAGCombine(ISD::ATOMIC_LOAD);
     603             :   setTargetDAGCombine(ISD::ATOMIC_STORE);
     604             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
     605             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
     606             :   setTargetDAGCombine(ISD::ATOMIC_SWAP);
     607             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
     608             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
     609             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
     610             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
     611             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
     612             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
     613             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
     614             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
     615             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
     616             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
     617             : 
     618             :   setSchedulingPreference(Sched::RegPressure);
     619        2241 : }
     620             : 
     621      961537 : const SISubtarget *SITargetLowering::getSubtarget() const {
     622      961537 :   return static_cast<const SISubtarget *>(Subtarget);
     623             : }
     624             : 
     625             : //===----------------------------------------------------------------------===//
     626             : // TargetLowering queries
     627             : //===----------------------------------------------------------------------===//
     628             : 
     629             : // v_mad_mix* support a conversion from f16 to f32.
     630             : //
      631             : // There is only one special case when denormals are enabled, which we don't
      632             : // currently handle, where this is OK to use.
     633          24 : bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
     634             :                                            EVT DestVT, EVT SrcVT) const {
     635          22 :   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
     636           2 :           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
     637          70 :          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
     638          59 :          SrcVT.getScalarType() == MVT::f16;
     639             : }
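                      : 
                      : // A rough illustration (assuming a gfx9-class subtarget with mad-mix or
                      : // fma-mix support and f32 denormals disabled): returning true here lets the
                      : // DAG combiner fold
                      : //   fma (fpext f16:a), (fpext f16:b), f32:c
                      : // into a single node with the f16 operands extended in place, which
                      : // instruction selection can then match as v_mad_mix_f32 / v_fma_mix_f32.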
     640             : 
     641          28 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
     642             :   // SI has some legal vector types, but no legal vector operations. Say no
     643             :   // shuffles are legal in order to prefer scalarizing some vector operations.
     644          28 :   return false;
     645             : }
     646             : 
     647        9078 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     648             :                                           const CallInst &CI,
     649             :                                           MachineFunction &MF,
     650             :                                           unsigned IntrID) const {
     651        9078 :   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
     652        9078 :           AMDGPU::lookupRsrcIntrinsicByIntr(IntrID)) {
     653             :     AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
     654        1265 :                                                   (Intrinsic::ID)IntrID);
     655        1265 :     if (Attr.hasFnAttribute(Attribute::ReadNone))
     656             :       return false;
     657             : 
     658        1226 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     659             : 
     660        1226 :     if (RsrcIntr->IsImage) {
     661         759 :       Info.ptrVal = MFI->getImagePSV(
     662         759 :         *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     663         759 :         CI.getArgOperand(RsrcIntr->RsrcArg));
     664         759 :       Info.align = 0;
     665             :     } else {
     666         467 :       Info.ptrVal = MFI->getBufferPSV(
     667         467 :         *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     668         467 :         CI.getArgOperand(RsrcIntr->RsrcArg));
     669             :     }
     670             : 
     671        1226 :     Info.flags = MachineMemOperand::MODereferenceable;
     672        1226 :     if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
     673         826 :       Info.opc = ISD::INTRINSIC_W_CHAIN;
     674         826 :       Info.memVT = MVT::getVT(CI.getType());
     675             :       Info.flags |= MachineMemOperand::MOLoad;
     676         400 :     } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
     677         303 :       Info.opc = ISD::INTRINSIC_VOID;
     678         606 :       Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
     679             :       Info.flags |= MachineMemOperand::MOStore;
     680             :     } else {
     681             :       // Atomic
     682          97 :       Info.opc = ISD::INTRINSIC_W_CHAIN;
     683          97 :       Info.memVT = MVT::getVT(CI.getType());
     684             :       Info.flags = MachineMemOperand::MOLoad |
     685             :                    MachineMemOperand::MOStore |
     686             :                    MachineMemOperand::MODereferenceable;
     687             : 
     688             :       // XXX - Should this be volatile without known ordering?
     689             :       Info.flags |= MachineMemOperand::MOVolatile;
     690             :     }
     691             :     return true;
     692             :   }
     693             : 
     694             :   switch (IntrID) {
     695         245 :   case Intrinsic::amdgcn_atomic_inc:
     696             :   case Intrinsic::amdgcn_atomic_dec:
     697             :   case Intrinsic::amdgcn_ds_fadd:
     698             :   case Intrinsic::amdgcn_ds_fmin:
     699             :   case Intrinsic::amdgcn_ds_fmax: {
     700         245 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     701         245 :     Info.memVT = MVT::getVT(CI.getType());
     702         245 :     Info.ptrVal = CI.getOperand(0);
     703         245 :     Info.align = 0;
     704         245 :     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     705             : 
     706             :     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     707         242 :     if (!Vol || !Vol->isZero())
     708             :       Info.flags |= MachineMemOperand::MOVolatile;
     709             : 
     710             :     return true;
     711             :   }
     712             : 
     713             :   default:
     714             :     return false;
     715             :   }
     716             : }
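                      : 
                      : // For the DS atomic case above, an IR call along the lines of (exact
                      : // intrinsic mangling aside)
                      : //   %v = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(
                      : //            i32 addrspace(3)* %ptr, i32 %val, i32 0, i32 0, i1 false)
                      : // is reported as an INTRINSIC_W_CHAIN access on %ptr with both MOLoad and
                      : // MOStore set, and is additionally marked volatile unless the final i1
                      : // operand is a constant false.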
     717             : 
     718       13360 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
     719             :                                             SmallVectorImpl<Value*> &Ops,
     720             :                                             Type *&AccessTy) const {
     721             :   switch (II->getIntrinsicID()) {
     722         269 :   case Intrinsic::amdgcn_atomic_inc:
     723             :   case Intrinsic::amdgcn_atomic_dec:
     724             :   case Intrinsic::amdgcn_ds_fadd:
     725             :   case Intrinsic::amdgcn_ds_fmin:
     726             :   case Intrinsic::amdgcn_ds_fmax: {
     727         538 :     Value *Ptr = II->getArgOperand(0);
     728         269 :     AccessTy = II->getType();
     729         269 :     Ops.push_back(Ptr);
     730             :     return true;
     731             :   }
     732             :   default:
     733             :     return false;
     734             :   }
     735             : }
     736             : 
     737       36844 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
     738       36844 :   if (!Subtarget->hasFlatInstOffsets()) {
     739             :     // Flat instructions do not have offsets, and only have the register
     740             :     // address.
     741       35364 :     return AM.BaseOffs == 0 && AM.Scale == 0;
     742             :   }
     743             : 
     744             :   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
     745             :   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
     746             : 
     747             :   // Just r + i
     748        1480 :   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
     749             : }
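                      : 
                      : // Worked example of the rule above: with flat instruction offsets (GFX9),
                      : // "ptr + 4095" is a legal addressing mode since 4095 fits in 12 unsigned
                      : // bits, while "ptr + 4096" or any negative offset is not; without flat
                      : // offsets, only a plain register address (no offset, no scaled index) is
                      : // accepted.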
     750             : 
     751       81727 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
     752       81727 :   if (Subtarget->hasFlatGlobalInsts())
     753       30152 :     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
     754             : 
     755       66651 :   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
      756             :       // Assume that we will use FLAT for all global memory accesses
     757             :       // on VI.
     758             :       // FIXME: This assumption is currently wrong.  On VI we still use
     759             :       // MUBUF instructions for the r + i addressing mode.  As currently
      760             :       // implemented, the MUBUF instructions only work on buffers < 4GB.
     761             :       // It may be possible to support > 4GB buffers with MUBUF instructions,
     762             :       // by setting the stride value in the resource descriptor which would
     763             :       // increase the size limit to (stride * 4GB).  However, this is risky,
     764             :       // because it has never been validated.
     765       29386 :     return isLegalFlatAddressingMode(AM);
     766             :   }
     767             : 
     768       37265 :   return isLegalMUBUFAddressingMode(AM);
     769             : }
     770             : 
     771       43219 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
     772             :   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
     773             :   // additionally can do r + r + i with addr64. 32-bit has more addressing
     774             :   // mode options. Depending on the resource constant, it can also do
     775             :   // (i64 r0) + (i32 r1) * (i14 i).
     776             :   //
     777             :   // Private arrays end up using a scratch buffer most of the time, so also
     778             :   // assume those use MUBUF instructions. Scratch loads / stores are currently
     779             :   // implemented as mubuf instructions with offen bit set, so slightly
     780             :   // different than the normal addr64.
     781       43219 :   if (!isUInt<12>(AM.BaseOffs))
     782             :     return false;
     783             : 
     784             :   // FIXME: Since we can split immediate into soffset and immediate offset,
     785             :   // would it make sense to allow any immediate?
     786             : 
     787       42700 :   switch (AM.Scale) {
     788             :   case 0: // r + i or just i, depending on HasBaseReg.
     789             :     return true;
     790             :   case 1:
     791             :     return true; // We have r + r or r + i.
     792         859 :   case 2:
     793         859 :     if (AM.HasBaseReg) {
     794             :       // Reject 2 * r + r.
     795             :       return false;
     796             :     }
     797             : 
     798             :     // Allow 2 * r as r + r
     799             :     // Or  2 * r + i is allowed as r + r + i.
     800           0 :     return true;
     801       13432 :   default: // Don't allow n * r
     802       13432 :     return false;
     803             :   }
     804             : }
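                      : 
                      : // Reading of the switch above: "r + i", "r + r" and "r + r + i" (Scale 0 or
                      : // 1) are accepted once the immediate fits in 12 unsigned bits; "2*r" is
                      : // accepted only when there is no base register, so it can be rewritten as
                      : // "r + r" (or "r + r + i"); any other scaled index is rejected.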
     805             : 
     806      121642 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     807             :                                              const AddrMode &AM, Type *Ty,
     808             :                                              unsigned AS, Instruction *I) const {
     809             :   // No global is ever allowed as a base.
     810      121642 :   if (AM.BaseGV)
     811             :     return false;
     812             : 
     813      119048 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
     814       81482 :     return isLegalGlobalAddressingMode(AM);
     815             : 
     816       75132 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
     817       37566 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
     818             :     // If the offset isn't a multiple of 4, it probably isn't going to be
     819             :     // correctly aligned.
     820             :     // FIXME: Can we get the real alignment here?
     821        4064 :     if (AM.BaseOffs % 4 != 0)
     822          99 :       return isLegalMUBUFAddressingMode(AM);
     823             : 
     824             :     // There are no SMRD extloads, so if we have to do a small type access we
     825             :     // will use a MUBUF load.
     826             :     // FIXME?: We also need to do this if unaligned, but we don't know the
     827             :     // alignment here.
     828        7847 :     if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
     829         245 :       return isLegalGlobalAddressingMode(AM);
     830             : 
     831        3720 :     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
     832             :       // SMRD instructions have an 8-bit, dword offset on SI.
     833        1195 :       if (!isUInt<8>(AM.BaseOffs / 4))
     834             :         return false;
     835        2525 :     } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
     836             :       // On CI+, this can also be a 32-bit literal constant offset. If it fits
     837             :       // in 8-bits, it can use a smaller encoding.
     838         974 :       if (!isUInt<32>(AM.BaseOffs / 4))
     839             :         return false;
     840        1551 :     } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     841             :       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
     842        1551 :       if (!isUInt<20>(AM.BaseOffs))
     843             :         return false;
     844             :     } else
     845           0 :       llvm_unreachable("unhandled generation");
     846             : 
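                      :     // Worked example of the per-generation limits above: a constant-address
                      :     // load at byte offset 1020 is accepted on SI (1020 / 4 = 255 fits in 8
                      :     // bits) but 1024 is not, while VI accepts byte offsets up to 2^20 - 1.
                      : 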
     847        3543 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     848             :       return true;
     849             : 
     850         384 :     if (AM.Scale == 1 && AM.HasBaseReg)
     851             :       return true;
     852             : 
     853         349 :     return false;
     854             : 
     855       33502 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     856        5855 :     return isLegalMUBUFAddressingMode(AM);
     857       35107 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
     858        7460 :              AS == AMDGPUASI.REGION_ADDRESS) {
     859             :     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     860             :     // field.
     861             :     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
     862             :     // an 8-bit dword offset but we don't know the alignment here.
     863       20189 :     if (!isUInt<16>(AM.BaseOffs))
     864             :       return false;
     865             : 
     866       18403 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     867             :       return true;
     868             : 
     869        3616 :     if (AM.Scale == 1 && AM.HasBaseReg)
     870             :       return true;
     871             : 
     872        2257 :     return false;
     873        7458 :   } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
     874             :              AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
     875             :     // For an unknown address space, this usually means that this is for some
     876             :     // reason being used for pure arithmetic, and not based on some addressing
     877             :     // computation. We don't have instructions that compute pointers with any
     878             :     // addressing modes, so treat them as having no offset like flat
     879             :     // instructions.
     880        7458 :     return isLegalFlatAddressingMode(AM);
     881             :   } else {
     882           0 :     llvm_unreachable("unhandled address space");
     883             :   }
     884             : }
     885             : 
     886       13840 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
     887             :                                         const SelectionDAG &DAG) const {
     888       13840 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
     889        7668 :     return (MemVT.getSizeInBits() <= 4 * 32);
     890        6172 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     891        1166 :     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     892        1166 :     return (MemVT.getSizeInBits() <= MaxPrivateBits);
     893        5006 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     894        5006 :     return (MemVT.getSizeInBits() <= 2 * 32);
     895             :   }
     896             :   return true;
     897             : }
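                      : 
                      : // In concrete terms: the store merger may form at most 128-bit (4 x 32-bit)
                      : // accesses for global/flat address spaces, at most 64-bit accesses for LDS,
                      : // and for scratch whatever the subtarget's maximum private element size
                      : // allows; anything wider has to stay split.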
     898             : 
     899      109435 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     900             :                                                       unsigned AddrSpace,
     901             :                                                       unsigned Align,
     902             :                                                       bool *IsFast) const {
     903      109435 :   if (IsFast)
     904       51797 :     *IsFast = false;
     905             : 
     906             :   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
     907             :   // which isn't a simple VT.
     908             :   // Until MVT is extended to handle this, simply check for the size and
     909             :   // rely on the condition below: allow accesses if the size is a multiple of 4.
     910      109435 :   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
     911             :                            VT.getStoreSize() > 16)) {
     912             :     return false;
     913             :   }
     914             : 
     915      210569 :   if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
     916      101134 :       AddrSpace == AMDGPUASI.REGION_ADDRESS) {
      917             :     // ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
      918             :     // aligned, 8-byte access in a single operation using ds_read2/write2_b32
     919             :     // with adjacent offsets.
     920        8301 :     bool AlignedBy4 = (Align % 4 == 0);
     921        8301 :     if (IsFast)
     922        5987 :       *IsFast = AlignedBy4;
     923             : 
     924             :     return AlignedBy4;
     925             :   }
     926             : 
     927             :   // FIXME: We have to be conservative here and assume that flat operations
     928             :   // will access scratch.  If we had access to the IR function, then we
     929             :   // could determine if any private memory was used in the function.
     930      202236 :   if (!Subtarget->hasUnalignedScratchAccess() &&
     931      201894 :       (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
     932      100792 :        AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
     933             :     return false;
     934             :   }
     935             : 
     936      100800 :   if (Subtarget->hasUnalignedBufferAccess()) {
      937             :     // If we have a uniform constant load, it still requires using a slow
     938             :     // buffer instruction if unaligned.
     939        5416 :     if (IsFast) {
     940       11373 :       *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
     941        7749 :                  AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
     942         167 :         (Align % 4 == 0) : true;
     943             :     }
     944             : 
     945             :     return true;
     946             :   }
     947             : 
      948             :   // Accesses smaller than a dword must be aligned.
     949       95384 :   if (VT.bitsLT(MVT::i32))
     950             :     return false;
     951             : 
     952             :   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
     953             :   // byte-address are ignored, thus forcing Dword alignment.
     954             :   // This applies to private, global, and constant memory.
     955       93633 :   if (IsFast)
     956       40187 :     *IsFast = true;
     957             : 
     958       93633 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
     959             : }
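
As a worked illustration of the LDS rule above: an 8-byte LDS access with only 4-byte alignment is still reported as fast, because it can be emitted as ds_read2_b32/ds_write2_b32 with adjacent dword offsets, while a 2-byte-aligned access is not. A distilled sketch (not the actual helper):

// Illustrative only: the LDS alignment policy above, in isolation.
static bool isFastUnalignedLDSAccessSketch(unsigned AlignInBytes) {
  // Any access that can be decomposed into naturally aligned dwords is
  // acceptable, e.g. a 4-byte aligned 8-byte access via ds_read2_b32.
  return AlignInBytes % 4 == 0;
}
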
     960             : 
     961         112 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
     962             :                                           unsigned SrcAlign, bool IsMemset,
     963             :                                           bool ZeroMemset,
     964             :                                           bool MemcpyStrSrc,
     965             :                                           MachineFunction &MF) const {
     966             :   // FIXME: Should account for address space here.
     967             : 
     968             :   // The default fallback uses the private pointer size as a guess for a type to
     969             :   // use. Make sure we switch these to 64-bit accesses.
     970             : 
     971         112 :   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
     972          86 :     return MVT::v4i32;
     973             : 
     974          26 :   if (Size >= 8 && DstAlign >= 4)
     975           8 :     return MVT::v2i32;
     976             : 
     977             :   // Use the default.
     978          18 :   return MVT::Other;
     979             : }
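
A few concrete, hypothetical inputs illustrate the heuristic above; the remaining parameters do not affect these results:

// Illustrative expectations only (other arguments of the real hook elided).
//   getOptimalMemOpType(/*Size=*/64, /*DstAlign=*/16, ...) == MVT::v4i32
//   getOptimalMemOpType(/*Size=*/ 8, /*DstAlign=*/ 8, ...) == MVT::v2i32
//   getOptimalMemOpType(/*Size=*/ 4, /*DstAlign=*/ 2, ...) == MVT::Other
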
     980             : 
     981             : static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
     982         482 :   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
     983             :          AS == AMDGPUASI.FLAT_ADDRESS ||
     984         872 :          AS == AMDGPUASI.CONSTANT_ADDRESS ||
     985         196 :          AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
     986             : }
     987             : 
     988         219 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
     989             :                                            unsigned DestAS) const {
     990         219 :   return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
     991         219 :          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
     992             : }
     993             : 
     994        3618 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
     995             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
     996        3618 :   const Value *Ptr = MemNode->getMemOperand()->getValue();
     997             :   const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
     998        5556 :   return I && I->getMetadata("amdgpu.noclobber");
     999             : }
    1000             : 
    1001          77 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
    1002             :                                             unsigned DestAS) const {
    1003             :   // Flat -> private/local is a simple truncate.
     1004             :   // Flat -> global is a no-op.
    1005          77 :   if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
    1006             :     return true;
    1007             : 
    1008          21 :   return isNoopAddrSpaceCast(SrcAS, DestAS);
    1009             : }
    1010             : 
    1011           0 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
    1012             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
    1013             : 
    1014           0 :   return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
    1015             : }
    1016             : 
    1017             : TargetLoweringBase::LegalizeTypeAction
    1018      186093 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
    1019      338571 :   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    1020             :     return TypeSplitVector;
    1021             : 
    1022             :   return TargetLoweringBase::getPreferredVectorAction(VT);
    1023             : }
    1024             : 
    1025          32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
    1026             :                                                          Type *Ty) const {
    1027             :   // FIXME: Could be smarter if called for vector constants.
    1028          32 :   return true;
    1029             : }
    1030             : 
    1031      300913 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
    1032      300913 :   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    1033       30704 :     switch (Op) {
    1034             :     case ISD::LOAD:
    1035             :     case ISD::STORE:
    1036             : 
    1037             :     // These operations are done with 32-bit instructions anyway.
    1038             :     case ISD::AND:
    1039             :     case ISD::OR:
    1040             :     case ISD::XOR:
    1041             :     case ISD::SELECT:
    1042             :       // TODO: Extensions?
    1043             :       return true;
    1044       26815 :     default:
    1045       26815 :       return false;
    1046             :     }
    1047             :   }
    1048             : 
    1049             :   // SimplifySetCC uses this function to determine whether or not it should
    1050             :   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
    1051         985 :   if (VT == MVT::i1 && Op == ISD::SETCC)
    1052             :     return false;
    1053             : 
    1054             :   return TargetLowering::isTypeDesirableForOp(Op, VT);
    1055             : }
    1056             : 
    1057       35723 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
    1058             :                                                    const SDLoc &SL,
    1059             :                                                    SDValue Chain,
    1060             :                                                    uint64_t Offset) const {
    1061       35723 :   const DataLayout &DL = DAG.getDataLayout();
    1062       35723 :   MachineFunction &MF = DAG.getMachineFunction();
    1063       35723 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1064             : 
    1065             :   const ArgDescriptor *InputPtrReg;
    1066             :   const TargetRegisterClass *RC;
    1067             : 
    1068             :   std::tie(InputPtrReg, RC)
    1069             :     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    1070             : 
    1071       35723 :   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    1072             :   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
    1073             :   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    1074       35723 :     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
    1075             : 
    1076       35723 :   return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
    1077             : }
    1078             : 
    1079          40 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
    1080             :                                             const SDLoc &SL) const {
    1081          40 :   auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    1082          40 :   uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
    1083          40 :   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
    1084             : }
    1085             : 
    1086       35683 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
    1087             :                                          const SDLoc &SL, SDValue Val,
    1088             :                                          bool Signed,
    1089             :                                          const ISD::InputArg *Arg) const {
    1090      106910 :   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
    1091          83 :       VT.bitsLT(MemVT)) {
    1092          46 :     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    1093          46 :     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
    1094             :   }
    1095             : 
    1096       35683 :   if (MemVT.isFloatingPoint())
    1097        2417 :     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
    1098       33266 :   else if (Signed)
    1099          16 :     Val = DAG.getSExtOrTrunc(Val, SL, VT);
    1100             :   else
    1101       33250 :     Val = DAG.getZExtOrTrunc(Val, SL, VT);
    1102             : 
    1103       35683 :   return Val;
    1104             : }
    1105             : 
    1106       35683 : SDValue SITargetLowering::lowerKernargMemParameter(
    1107             :   SelectionDAG &DAG, EVT VT, EVT MemVT,
    1108             :   const SDLoc &SL, SDValue Chain,
    1109             :   uint64_t Offset, unsigned Align, bool Signed,
    1110             :   const ISD::InputArg *Arg) const {
    1111       35683 :   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    1112       35683 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    1113       35683 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    1114             : 
    1115             : 
    1116             :   // Try to avoid using an extload by loading earlier than the argument address,
    1117             :   // and extracting the relevant bits. The load should hopefully be merged with
     1118             :   // the load for the previous argument.
    1119       35683 :   if (Align < 4) {
    1120             :     //if (MemVT.getStoreSize() < 4) {
    1121             :     assert(MemVT.getStoreSize() < 4);
    1122             :     int64_t AlignDownOffset = alignDown(Offset, 4);
    1123        1214 :     int64_t OffsetDiff = Offset - AlignDownOffset;
    1124             : 
    1125        1214 :     EVT IntVT = MemVT.changeTypeToInteger();
    1126             : 
    1127             :     // TODO: If we passed in the base kernel offset we could have a better
    1128             :     // alignment than 4, but we don't really need it.
    1129        1214 :     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    1130             :     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
    1131             :                                MachineMemOperand::MODereferenceable |
    1132        1214 :                                MachineMemOperand::MOInvariant);
    1133             : 
    1134        1214 :     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
    1135        1214 :     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
    1136             : 
    1137        1214 :     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
    1138        1214 :     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    1139        1214 :     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
    1140             : 
    1141             : 
    1142        2428 :     return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
    1143             :   }
    1144             : 
    1145       34469 :   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
    1146             :   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
    1147             :                              MachineMemOperand::MODereferenceable |
    1148       34469 :                              MachineMemOperand::MOInvariant);
    1149             : 
    1150       34469 :   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
    1151       68938 :   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
    1152             : }
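
The under-aligned path above can be illustrated with scalar arithmetic. For example, an i16 kernel argument at byte offset 2 is handled by loading the containing dword at offset 0, shifting right by 16 bits, and truncating. A standalone sketch with hypothetical names:

#include <cstdint>

// Illustrative only: the shift-and-truncate extraction used above for a
// sub-dword kernel argument that is not 4-byte aligned (little-endian).
static uint16_t extractSubDwordKernArgSketch(uint32_t ContainingDword,
                                             uint64_t ByteOffset) {
  uint64_t AlignDownOffset = ByteOffset & ~UINT64_C(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = ByteOffset - AlignDownOffset;   // bytes into the dword
  return static_cast<uint16_t>(ContainingDword >> (OffsetDiff * 8)); // SRL + TRUNC
}
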
    1153             : 
    1154         216 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
    1155             :                                               const SDLoc &SL, SDValue Chain,
    1156             :                                               const ISD::InputArg &Arg) const {
    1157         216 :   MachineFunction &MF = DAG.getMachineFunction();
    1158         216 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1159             : 
    1160         216 :   if (Arg.Flags.isByVal()) {
    1161          67 :     unsigned Size = Arg.Flags.getByValSize();
    1162          67 :     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    1163          67 :     return DAG.getFrameIndex(FrameIdx, MVT::i32);
    1164             :   }
    1165             : 
    1166         149 :   unsigned ArgOffset = VA.getLocMemOffset();
    1167         298 :   unsigned ArgSize = VA.getValVT().getStoreSize();
    1168             : 
    1169         149 :   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
    1170             : 
    1171             :   // Create load nodes to retrieve arguments from the stack.
    1172         149 :   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
    1173             :   SDValue ArgValue;
    1174             : 
     1175             :   // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
    1176             :   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
    1177             :   MVT MemVT = VA.getValVT();
    1178             : 
    1179         149 :   switch (VA.getLocInfo()) {
    1180             :   default:
    1181             :     break;
    1182           0 :   case CCValAssign::BCvt:
    1183             :     MemVT = VA.getLocVT();
    1184           0 :     break;
    1185           0 :   case CCValAssign::SExt:
    1186             :     ExtType = ISD::SEXTLOAD;
    1187           0 :     break;
    1188           0 :   case CCValAssign::ZExt:
    1189             :     ExtType = ISD::ZEXTLOAD;
    1190           0 :     break;
    1191           3 :   case CCValAssign::AExt:
    1192             :     ExtType = ISD::EXTLOAD;
    1193           3 :     break;
    1194             :   }
    1195             : 
    1196         149 :   ArgValue = DAG.getExtLoad(
    1197             :     ExtType, SL, VA.getLocVT(), Chain, FIN,
    1198             :     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
    1199         298 :     MemVT);
    1200         149 :   return ArgValue;
    1201             : }
    1202             : 
    1203         192 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
    1204             :   const SIMachineFunctionInfo &MFI,
    1205             :   EVT VT,
    1206             :   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
    1207             :   const ArgDescriptor *Reg;
    1208             :   const TargetRegisterClass *RC;
    1209             : 
    1210             :   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
    1211         192 :   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
    1212             : }
    1213             : 
    1214        1075 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
    1215             :                                    CallingConv::ID CallConv,
    1216             :                                    ArrayRef<ISD::InputArg> Ins,
    1217             :                                    BitVector &Skipped,
    1218             :                                    FunctionType *FType,
    1219             :                                    SIMachineFunctionInfo *Info) {
    1220        5406 :   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    1221        4331 :     const ISD::InputArg &Arg = Ins[I];
    1222             : 
     1223             :     // First, check if it's a PS input address.
    1224        6318 :     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
    1225        6870 :         !Arg.Flags.isByVal() && PSInputNum <= 15) {
    1226             : 
    1227        4656 :       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
    1228             :         // We can safely skip PS inputs.
    1229             :         Skipped.set(I);
    1230        1001 :         ++PSInputNum;
    1231        1001 :         continue;
    1232             :       }
    1233             : 
    1234             :       Info->markPSInputAllocated(PSInputNum);
    1235        1538 :       if (Arg.Used)
    1236             :         Info->markPSInputEnabled(PSInputNum);
    1237             : 
    1238        1538 :       ++PSInputNum;
    1239             :     }
    1240             : 
     1241             :     // Second, split vector arguments into their elements.
    1242        3330 :     if (Arg.VT.isVector()) {
    1243        1002 :       ISD::InputArg NewArg = Arg;
    1244             :       NewArg.Flags.setSplit();
    1245        1002 :       NewArg.VT = Arg.VT.getVectorElementType();
    1246             : 
    1247             :       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
    1248             :       // three or five element vertex only needs three or five registers,
    1249             :       // NOT four or eight.
    1250        1002 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1251             :       unsigned NumElements = ParamType->getVectorNumElements();
    1252             : 
    1253       11030 :       for (unsigned J = 0; J != NumElements; ++J) {
    1254        5014 :         Splits.push_back(NewArg);
    1255        5014 :         NewArg.PartOffset += NewArg.VT.getStoreSize();
    1256             :       }
    1257             :     } else {
    1258        2328 :       Splits.push_back(Arg);
    1259             :     }
    1260             :   }
    1261        1075 : }
    1262             : 
    1263             : // Allocate special inputs passed in VGPRs.
    1264       16438 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
    1265             :                                            MachineFunction &MF,
    1266             :                                            const SIRegisterInfo &TRI,
    1267             :                                            SIMachineFunctionInfo &Info) {
    1268       16438 :   if (Info.hasWorkItemIDX()) {
    1269             :     unsigned Reg = AMDGPU::VGPR0;
    1270       15363 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1271             : 
    1272       15363 :     CCInfo.AllocateReg(Reg);
    1273             :     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
    1274             :   }
    1275             : 
    1276       16438 :   if (Info.hasWorkItemIDY()) {
    1277             :     unsigned Reg = AMDGPU::VGPR1;
    1278         137 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1279             : 
    1280         137 :     CCInfo.AllocateReg(Reg);
    1281             :     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    1282             :   }
    1283             : 
    1284       16438 :   if (Info.hasWorkItemIDZ()) {
    1285             :     unsigned Reg = AMDGPU::VGPR2;
    1286          76 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1287             : 
    1288          76 :     CCInfo.AllocateReg(Reg);
    1289             :     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    1290             :   }
    1291       16438 : }
    1292             : 
     1293             : // Try to allocate a VGPR at the end of the argument list, or, if no argument
     1294             : // VGPRs are left, allocate a stack slot instead.
    1295          36 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
    1296             :   ArrayRef<MCPhysReg> ArgVGPRs
    1297          36 :     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
    1298             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
    1299          36 :   if (RegIdx == ArgVGPRs.size()) {
    1300             :     // Spill to stack required.
    1301           8 :     int64_t Offset = CCInfo.AllocateStack(4, 4);
    1302             : 
    1303             :     return ArgDescriptor::createStack(Offset);
    1304             :   }
    1305             : 
    1306          28 :   unsigned Reg = ArgVGPRs[RegIdx];
    1307          28 :   Reg = CCInfo.AllocateReg(Reg);
    1308             :   assert(Reg != AMDGPU::NoRegister);
    1309             : 
    1310          28 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1311          28 :   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1312             :   return ArgDescriptor::createRegister(Reg);
    1313             : }
    1314             : 
    1315         119 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
    1316             :                                              const TargetRegisterClass *RC,
    1317             :                                              unsigned NumArgRegs) {
    1318         119 :   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
    1319             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
    1320         119 :   if (RegIdx == ArgSGPRs.size())
    1321           0 :     report_fatal_error("ran out of SGPRs for arguments");
    1322             : 
    1323         119 :   unsigned Reg = ArgSGPRs[RegIdx];
    1324         119 :   Reg = CCInfo.AllocateReg(Reg);
    1325             :   assert(Reg != AMDGPU::NoRegister);
    1326             : 
    1327         119 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1328         119 :   MF.addLiveIn(Reg, RC);
    1329         119 :   return ArgDescriptor::createRegister(Reg);
    1330             : }
    1331             : 
    1332             : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
    1333          62 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
    1334             : }
    1335             : 
    1336             : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
    1337          57 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
    1338             : }
    1339             : 
    1340        1399 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
    1341             :                                       MachineFunction &MF,
    1342             :                                       const SIRegisterInfo &TRI,
    1343             :                                       SIMachineFunctionInfo &Info) {
    1344        1399 :   if (Info.hasWorkItemIDX())
    1345          18 :     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
    1346             : 
    1347        1399 :   if (Info.hasWorkItemIDY())
    1348          10 :     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
    1349             : 
    1350        1399 :   if (Info.hasWorkItemIDZ())
    1351           8 :     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
    1352        1399 : }
    1353             : 
    1354        1399 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
    1355             :                                       MachineFunction &MF,
    1356             :                                       const SIRegisterInfo &TRI,
    1357             :                                       SIMachineFunctionInfo &Info) {
    1358             :   auto &ArgInfo = Info.getArgInfo();
    1359             : 
    1360             :   // TODO: Unify handling with private memory pointers.
    1361             : 
    1362        1399 :   if (Info.hasDispatchPtr())
    1363          10 :     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
    1364             : 
    1365        1399 :   if (Info.hasQueuePtr())
    1366          11 :     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
    1367             : 
    1368        1399 :   if (Info.hasKernargSegmentPtr())
    1369          14 :     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
    1370             : 
    1371        1399 :   if (Info.hasDispatchID())
    1372          10 :     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
    1373             : 
    1374             :   // flat_scratch_init is not applicable for non-kernel functions.
    1375             : 
    1376        1399 :   if (Info.hasWorkGroupIDX())
    1377          22 :     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
    1378             : 
    1379        1399 :   if (Info.hasWorkGroupIDY())
    1380          20 :     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
    1381             : 
    1382        1399 :   if (Info.hasWorkGroupIDZ())
    1383          20 :     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
    1384             : 
    1385        1399 :   if (Info.hasImplicitArgPtr())
    1386          12 :     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
    1387        1399 : }
    1388             : 
    1389             : // Allocate special inputs passed in user SGPRs.
    1390       16438 : static void allocateHSAUserSGPRs(CCState &CCInfo,
    1391             :                                  MachineFunction &MF,
    1392             :                                  const SIRegisterInfo &TRI,
    1393             :                                  SIMachineFunctionInfo &Info) {
    1394       16438 :   if (Info.hasImplicitBufferPtr()) {
    1395           2 :     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    1396           2 :     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    1397           2 :     CCInfo.AllocateReg(ImplicitBufferPtrReg);
    1398             :   }
    1399             : 
    1400             :   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
    1401       16438 :   if (Info.hasPrivateSegmentBuffer()) {
    1402        2051 :     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    1403        2051 :     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    1404        2051 :     CCInfo.AllocateReg(PrivateSegmentBufferReg);
    1405             :   }
    1406             : 
    1407       16438 :   if (Info.hasDispatchPtr()) {
    1408          42 :     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    1409          42 :     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    1410          42 :     CCInfo.AllocateReg(DispatchPtrReg);
    1411             :   }
    1412             : 
    1413       16438 :   if (Info.hasQueuePtr()) {
    1414          57 :     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    1415          57 :     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    1416          57 :     CCInfo.AllocateReg(QueuePtrReg);
    1417             :   }
    1418             : 
    1419       16438 :   if (Info.hasKernargSegmentPtr()) {
    1420       14435 :     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
    1421       14435 :     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    1422       14435 :     CCInfo.AllocateReg(InputPtrReg);
    1423             :   }
    1424             : 
    1425       16438 :   if (Info.hasDispatchID()) {
    1426           5 :     unsigned DispatchIDReg = Info.addDispatchID(TRI);
    1427           5 :     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    1428           5 :     CCInfo.AllocateReg(DispatchIDReg);
    1429             :   }
    1430             : 
    1431       16438 :   if (Info.hasFlatScratchInit()) {
    1432         359 :     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    1433         359 :     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    1434         359 :     CCInfo.AllocateReg(FlatScratchInitReg);
    1435             :   }
    1436             : 
    1437             :   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
    1438             :   // these from the dispatch pointer.
    1439       16438 : }
    1440             : 
    1441             : // Allocate special input registers that are initialized per-wave.
    1442       16438 : static void allocateSystemSGPRs(CCState &CCInfo,
    1443             :                                 MachineFunction &MF,
    1444             :                                 SIMachineFunctionInfo &Info,
    1445             :                                 CallingConv::ID CallConv,
    1446             :                                 bool IsShader) {
    1447       16438 :   if (Info.hasWorkGroupIDX()) {
    1448       15363 :     unsigned Reg = Info.addWorkGroupIDX();
    1449       15363 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1450       15363 :     CCInfo.AllocateReg(Reg);
    1451             :   }
    1452             : 
    1453       16438 :   if (Info.hasWorkGroupIDY()) {
    1454          24 :     unsigned Reg = Info.addWorkGroupIDY();
    1455          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1456          24 :     CCInfo.AllocateReg(Reg);
    1457             :   }
    1458             : 
    1459       16438 :   if (Info.hasWorkGroupIDZ()) {
    1460          24 :     unsigned Reg = Info.addWorkGroupIDZ();
    1461          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1462          24 :     CCInfo.AllocateReg(Reg);
    1463             :   }
    1464             : 
    1465       16438 :   if (Info.hasWorkGroupInfo()) {
    1466           0 :     unsigned Reg = Info.addWorkGroupInfo();
    1467           0 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1468           0 :     CCInfo.AllocateReg(Reg);
    1469             :   }
    1470             : 
    1471       16438 :   if (Info.hasPrivateSegmentWaveByteOffset()) {
    1472             :     // Scratch wave offset passed in system SGPR.
    1473             :     unsigned PrivateSegmentWaveByteOffsetReg;
    1474             : 
    1475       15413 :     if (IsShader) {
    1476             :       PrivateSegmentWaveByteOffsetReg =
    1477             :         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
    1478             : 
    1479             :       // This is true if the scratch wave byte offset doesn't have a fixed
    1480             :       // location.
    1481          50 :       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
    1482             :         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
    1483             :         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    1484             :       }
    1485             :     } else
    1486       15363 :       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    1487             : 
    1488       15413 :     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    1489       15413 :     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
    1490             :   }
    1491       16438 : }
    1492             : 
    1493       16459 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
    1494             :                                      MachineFunction &MF,
    1495             :                                      const SIRegisterInfo &TRI,
    1496             :                                      SIMachineFunctionInfo &Info) {
    1497             :   // Now that we've figured out where the scratch register inputs are, see if
     1498             :   // we should reserve the arguments and use them directly.
    1499       16459 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1500             :   bool HasStackObjects = MFI.hasStackObjects();
    1501             : 
    1502             :   // Record that we know we have non-spill stack objects so we don't need to
    1503             :   // check all stack objects later.
    1504       16459 :   if (HasStackObjects)
    1505             :     Info.setHasNonSpillStackObjects(true);
    1506             : 
    1507             :   // Everything live out of a block is spilled with fast regalloc, so it's
    1508             :   // almost certain that spilling will be required.
    1509       16459 :   if (TM.getOptLevel() == CodeGenOpt::None)
    1510             :     HasStackObjects = true;
    1511             : 
    1512             :   // For now assume stack access is needed in any callee functions, so we need
     1513             :   // the scratch registers to be passed in.
    1514       16274 :   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
    1515             : 
    1516       16459 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1517       16459 :   if (ST.isAmdCodeObjectV2(MF.getFunction())) {
    1518        2054 :     if (RequiresStackAccess) {
    1519             :       // If we have stack objects, we unquestionably need the private buffer
    1520             :       // resource. For the Code Object V2 ABI, this will be the first 4 user
    1521             :       // SGPR inputs. We can reserve those and use them directly.
    1522             : 
    1523             :       unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
    1524             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    1525             :       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    1526             : 
    1527         456 :       if (MFI.hasCalls()) {
    1528             :         // If we have calls, we need to keep the frame register in a register
    1529             :         // that won't be clobbered by a call, so ensure it is copied somewhere.
    1530             : 
    1531             :         // This is not a problem for the scratch wave offset, because the same
    1532             :         // registers are reserved in all functions.
    1533             : 
    1534             :         // FIXME: Nothing is really ensuring this is a call preserved register,
    1535             :         // it's just selected from the end so it happens to be.
    1536             :         unsigned ReservedOffsetReg
    1537         245 :           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1538             :         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1539             :       } else {
    1540             :         unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
    1541             :           AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1542             :         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    1543             :       }
    1544             :     } else {
    1545             :       unsigned ReservedBufferReg
    1546        1598 :         = TRI.reservedPrivateSegmentBufferReg(MF);
    1547             :       unsigned ReservedOffsetReg
    1548        1598 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1549             : 
    1550             :       // We tentatively reserve the last registers (skipping the last two
    1551             :       // which may contain VCC). After register allocation, we'll replace
    1552             :       // these with the ones immediately after those which were really
     1553             :       // allocated. In the prologue, copies will be inserted from the argument
    1554             :       // to these reserved registers.
    1555             :       Info.setScratchRSrcReg(ReservedBufferReg);
    1556             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1557             :     }
    1558             :   } else {
    1559       14405 :     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    1560             : 
    1561             :     // Without HSA, relocations are used for the scratch pointer and the
    1562             :     // buffer resource setup is always inserted in the prologue. Scratch wave
    1563             :     // offset is still in an input SGPR.
    1564             :     Info.setScratchRSrcReg(ReservedBufferReg);
    1565             : 
    1566       14405 :     if (HasStackObjects && !MFI.hasCalls()) {
    1567             :       unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
    1568             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1569             :       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    1570             :     } else {
    1571             :       unsigned ReservedOffsetReg
    1572       14141 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1573             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1574             :     }
    1575             :   }
    1576       16459 : }
    1577             : 
    1578       17655 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
    1579       17655 :   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1580       17655 :   return !Info->isEntryFunction();
    1581             : }
    1582             : 
    1583        1399 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
    1584             : 
    1585        1399 : }
    1586             : 
    1587        1399 : void SITargetLowering::insertCopiesSplitCSR(
    1588             :   MachineBasicBlock *Entry,
    1589             :   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
    1590        1399 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1591             : 
    1592        1399 :   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    1593        1399 :   if (!IStart)
    1594        1399 :     return;
    1595             : 
    1596           0 :   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    1597           0 :   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
    1598           0 :   MachineBasicBlock::iterator MBBI = Entry->begin();
    1599           0 :   for (const MCPhysReg *I = IStart; *I; ++I) {
    1600             :     const TargetRegisterClass *RC = nullptr;
    1601           0 :     if (AMDGPU::SReg_64RegClass.contains(*I))
    1602             :       RC = &AMDGPU::SGPR_64RegClass;
    1603           0 :     else if (AMDGPU::SReg_32RegClass.contains(*I))
    1604             :       RC = &AMDGPU::SGPR_32RegClass;
    1605             :     else
    1606           0 :       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    1607             : 
    1608           0 :     unsigned NewVR = MRI->createVirtualRegister(RC);
    1609             :     // Create copy from CSR to a virtual register.
    1610           0 :     Entry->addLiveIn(*I);
    1611           0 :     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
    1612           0 :       .addReg(*I);
    1613             : 
    1614             :     // Insert the copy-back instructions right before the terminator.
    1615           0 :     for (auto *Exit : Exits)
    1616           0 :       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
    1617           0 :               TII->get(TargetOpcode::COPY), *I)
    1618           0 :         .addReg(NewVR);
    1619             :   }
    1620             : }
    1621             : 
    1622       17840 : SDValue SITargetLowering::LowerFormalArguments(
    1623             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1624             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1625             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1626       17840 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1627             : 
    1628       17840 :   MachineFunction &MF = DAG.getMachineFunction();
    1629       17840 :   const Function &Fn = MF.getFunction();
    1630             :   FunctionType *FType = MF.getFunction().getFunctionType();
    1631       17840 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1632       17840 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1633             : 
    1634       35680 :   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    1635             :     DiagnosticInfoUnsupported NoGraphicsHSA(
    1636           6 :         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    1637           3 :     DAG.getContext()->diagnose(NoGraphicsHSA);
    1638             :     return DAG.getEntryNode();
    1639             :   }
    1640             : 
     1641             :   // Create stack objects that are used for emitting the debugger prologue if
     1642             :   // the "amdgpu-debugger-emit-prologue" attribute was specified.
    1643       17837 :   if (ST.debuggerEmitPrologue())
    1644           4 :     createDebuggerPrologueStackObjects(MF);
    1645             : 
    1646             :   SmallVector<ISD::InputArg, 16> Splits;
    1647             :   SmallVector<CCValAssign, 16> ArgLocs;
    1648       17837 :   BitVector Skipped(Ins.size());
    1649             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1650       35674 :                  *DAG.getContext());
    1651             : 
    1652       17837 :   bool IsShader = AMDGPU::isShader(CallConv);
    1653             :   bool IsKernel = AMDGPU::isKernel(CallConv);
    1654       17837 :   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
    1655             : 
    1656       17837 :   if (!IsEntryFunc) {
    1657             :     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
    1658             :     // this when allocating argument fixed offsets.
    1659        1399 :     CCInfo.AllocateStack(4, 4);
    1660             :   }
    1661             : 
    1662       17837 :   if (IsShader) {
    1663        1075 :     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
    1664             : 
    1665             :     // At least one interpolation mode must be enabled or else the GPU will
    1666             :     // hang.
    1667             :     //
    1668             :     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    1669             :     // set PSInputAddr, the user wants to enable some bits after the compilation
    1670             :     // based on run-time states. Since we can't know what the final PSInputEna
     1671             :     // will look like, we shouldn't do anything here, and the user should take
    1672             :     // responsibility for the correct programming.
    1673             :     //
    1674             :     // Otherwise, the following restrictions apply:
    1675             :     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    1676             :     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    1677             :     //   enabled too.
    1678        1075 :     if (CallConv == CallingConv::AMDGPU_PS) {
    1679        1470 :       if ((Info->getPSInputAddr() & 0x7F) == 0 ||
    1680         648 :            ((Info->getPSInputAddr() & 0xF) == 0 &&
    1681             :             Info->isPSInputAllocated(11))) {
    1682             :         CCInfo.AllocateReg(AMDGPU::VGPR0);
    1683             :         CCInfo.AllocateReg(AMDGPU::VGPR1);
    1684             :         Info->markPSInputAllocated(0);
    1685             :         Info->markPSInputEnabled(0);
    1686             :       }
    1687        1650 :       if (Subtarget->isAmdPalOS()) {
    1688             :         // For isAmdPalOS, the user does not enable some bits after compilation
    1689             :         // based on run-time states; the register values being generated here are
    1690             :         // the final ones set in hardware. Therefore we need to apply the
    1691             :         // workaround to PSInputAddr and PSInputEnable together.  (The case where
    1692             :         // a bit is set in PSInputAddr but not PSInputEnable is where the
    1693             :         // frontend set up an input arg for a particular interpolation mode, but
    1694             :         // nothing uses that input arg. Really we should have an earlier pass
    1695             :         // that removes such an arg.)
    1696          10 :         unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    1697          17 :         if ((PsInputBits & 0x7F) == 0 ||
    1698           7 :             ((PsInputBits & 0xF) == 0 &&
    1699             :              (PsInputBits >> 11 & 1)))
    1700           3 :           Info->markPSInputEnabled(
    1701             :               countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    1702             :       }
    1703             :     }
    1704             : 
    1705             :     assert(!Info->hasDispatchPtr() &&
    1706             :            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
    1707             :            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
    1708             :            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
    1709             :            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
    1710             :            !Info->hasWorkItemIDZ());
    1711       16762 :   } else if (IsKernel) {
    1712             :     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    1713             :   } else {
    1714        1399 :     Splits.append(Ins.begin(), Ins.end());
    1715             :   }
    1716             : 
    1717       17837 :   if (IsEntryFunc) {
    1718       16438 :     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    1719       16438 :     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
    1720             :   }
    1721             : 
    1722       17837 :   if (IsKernel) {
    1723       15363 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1724             :   } else {
    1725        2474 :     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    1726        2474 :     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
    1727             :   }
    1728             : 
    1729             :   SmallVector<SDValue, 16> Chains;
    1730             : 
    1731             :   // FIXME: This is the minimum kernel argument alignment. We should improve
    1732             :   // this to the maximum alignment of the arguments.
    1733             :   //
     1734             :   // FIXME: Alignment of explicit arguments is totally broken with a non-0
     1735             :   // explicit kern arg offset.
    1736             :   const unsigned KernelArgBaseAlign = 16;
    1737       17837 :   const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
    1738             : 
    1739       61299 :    for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    1740       43462 :     const ISD::InputArg &Arg = Ins[i];
    1741       44463 :     if (Skipped[i]) {
    1742        2002 :       InVals.push_back(DAG.getUNDEF(Arg.VT));
    1743       38800 :       continue;
    1744             :     }
    1745             : 
    1746       42461 :     CCValAssign &VA = ArgLocs[ArgIdx++];
    1747             :     MVT VT = VA.getLocVT();
    1748             : 
    1749       81371 :     if (IsEntryFunc && VA.isMemLoc()) {
    1750       35580 :       VT = Ins[i].VT;
    1751             :       EVT MemVT = VA.getLocVT();
    1752             : 
    1753       35580 :       const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
    1754       35580 :       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
    1755       35580 :       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
    1756             : 
     1757             :       // The first 36 bytes of the input buffer contain information about
    1758             :       // thread group and global sizes for clover.
    1759             :       SDValue Arg = lowerKernargMemParameter(
    1760       71160 :         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
    1761       35580 :       Chains.push_back(Arg.getValue(1));
    1762             : 
    1763             :       auto *ParamTy =
    1764       35580 :         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    1765       49033 :       if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    1766       44325 :           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    1767             :         // On SI local pointers are just offsets into LDS, so they are always
    1768             :         // less than 16-bits.  On CI and newer they could potentially be
    1769             :         // real pointers, so we can't guarantee their size.
    1770         640 :         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
    1771        1280 :                           DAG.getValueType(MVT::i16));
    1772             :       }
    1773             : 
    1774       35580 :       InVals.push_back(Arg);
    1775       35580 :       continue;
    1776       10432 :     } else if (!IsEntryFunc && VA.isMemLoc()) {
    1777         216 :       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
    1778         216 :       InVals.push_back(Val);
    1779         216 :       if (!Arg.Flags.isByVal())
    1780         149 :         Chains.push_back(Val.getValue(1));
    1781         216 :       continue;
    1782             :     }
    1783             : 
    1784             :     assert(VA.isRegLoc() && "Parameter must be in a register!");
    1785             : 
    1786        6665 :     unsigned Reg = VA.getLocReg();
    1787        6665 :     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    1788             :     EVT ValVT = VA.getValVT();
    1789             : 
    1790        6665 :     Reg = MF.addLiveIn(Reg, RC);
    1791        6665 :     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1792             : 
    1793        6677 :     if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
    1794             :       // The return object should be reasonably addressable.
    1795             : 
     1796             :       // FIXME: This helps when the return is a real sret. If it is an
    1797             :       // automatically inserted sret (i.e. CanLowerReturn returns false), an
    1798             :       // extra copy is inserted in SelectionDAGBuilder which obscures this.
    1799          12 :       unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
    1800          12 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1801          24 :         DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    1802             :     }
    1803             : 
    1804             :     // If this is an 8 or 16-bit value, it is really passed promoted
    1805             :     // to 32 bits. Insert an assert[sz]ext to capture this, then
    1806             :     // truncate to the right size.
    1807        6665 :     switch (VA.getLocInfo()) {
    1808             :     case CCValAssign::Full:
    1809             :       break;
    1810             :     case CCValAssign::BCvt:
    1811           0 :       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
    1812           0 :       break;
    1813             :     case CCValAssign::SExt:
    1814           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
    1815          14 :                         DAG.getValueType(ValVT));
    1816           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1817           7 :       break;
    1818             :     case CCValAssign::ZExt:
    1819          12 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1820          24 :                         DAG.getValueType(ValVT));
    1821          12 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1822          12 :       break;
    1823             :     case CCValAssign::AExt:
    1824           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1825           7 :       break;
    1826           0 :     default:
    1827           0 :       llvm_unreachable("Unknown loc info!");
    1828             :     }
    1829             : 
    1830        9995 :     if (IsShader && Arg.VT.isVector()) {
    1831             :       // Build a vector from the registers
    1832        1002 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1833             :       unsigned NumElements = ParamType->getVectorNumElements();
    1834             : 
    1835             :       SmallVector<SDValue, 4> Regs;
    1836        1002 :       Regs.push_back(Val);
    1837        9026 :       for (unsigned j = 1; j != NumElements; ++j) {
    1838        8024 :         Reg = ArgLocs[ArgIdx++].getLocReg();
    1839        4012 :         Reg = MF.addLiveIn(Reg, RC);
    1840             : 
    1841        4012 :         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1842        4012 :         Regs.push_back(Copy);
    1843             :       }
    1844             : 
    1845             :       // Fill up the missing vector elements
    1846        1002 :       NumElements = Arg.VT.getVectorNumElements() - NumElements;
    1847        1002 :       Regs.append(NumElements, DAG.getUNDEF(VT));
    1848             : 
    1849        2004 :       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
    1850             :       continue;
    1851             :     }
    1852             : 
    1853        5663 :     InVals.push_back(Val);
    1854             :   }
    1855             : 
    1856       17837 :   if (!IsEntryFunc) {
    1857             :     // Special inputs come after user arguments.
    1858        1399 :     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
    1859             :   }
    1860             : 
    1861             :   // Start adding system SGPRs.
    1862       17837 :   if (IsEntryFunc) {
    1863       16438 :     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
    1864             :   } else {
    1865        1399 :     CCInfo.AllocateReg(Info->getScratchRSrcReg());
    1866        1399 :     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    1867        1399 :     CCInfo.AllocateReg(Info->getFrameOffsetReg());
    1868        1399 :     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
    1869             :   }
    1870             : 
    1871             :   auto &ArgUsageInfo =
    1872       17837 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    1873       17837 :   ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
    1874             : 
    1875       17837 :   unsigned StackArgSize = CCInfo.getNextStackOffset();
    1876             :   Info->setBytesInStackArgArea(StackArgSize);
    1877             : 
    1878       17837 :   return Chains.empty() ? Chain :
    1879       32294 :     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    1880             : }
    1881             : 
    1882             : // TODO: If return values can't fit in registers, we should return as many as
    1883             : // possible in registers before passing the rest on the stack.
    1884       18333 : bool SITargetLowering::CanLowerReturn(
    1885             :   CallingConv::ID CallConv,
    1886             :   MachineFunction &MF, bool IsVarArg,
    1887             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1888             :   LLVMContext &Context) const {
    1889             :   // Replacing returns with sret/stack usage doesn't make sense for shaders.
    1890             :   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
    1891             :   // for shaders. Vector types should be explicitly handled by CC.
    1892       18333 :   if (AMDGPU::isEntryFunctionCC(CallConv))
    1893             :     return true;
    1894             : 
    1895             :   SmallVector<CCValAssign, 16> RVLocs;
    1896        3784 :   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    1897        1892 :   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
    1898             : }
    1899             : 
    1900             : SDValue
    1901       17773 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    1902             :                               bool isVarArg,
    1903             :                               const SmallVectorImpl<ISD::OutputArg> &Outs,
    1904             :                               const SmallVectorImpl<SDValue> &OutVals,
    1905             :                               const SDLoc &DL, SelectionDAG &DAG) const {
    1906       17773 :   MachineFunction &MF = DAG.getMachineFunction();
    1907       17773 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1908             : 
    1909             :   if (AMDGPU::isKernel(CallConv)) {
    1910             :     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
    1911       15337 :                                              OutVals, DL, DAG);
    1912             :   }
    1913             : 
    1914        2436 :   bool IsShader = AMDGPU::isShader(CallConv);
    1915             : 
    1916        2436 :   Info->setIfReturnsVoid(Outs.size() == 0);
    1917        2436 :   bool IsWaveEnd = Info->returnsVoid() && IsShader;
    1918             : 
    1919             :   SmallVector<ISD::OutputArg, 48> Splits;
    1920             :   SmallVector<SDValue, 48> SplitVals;
    1921             : 
    1922             :   // Split vectors into their elements.
    1923        4244 :   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    1924        1808 :     const ISD::OutputArg &Out = Outs[i];
    1925             : 
    1926        2802 :     if (IsShader && Out.VT.isVector()) {
    1927         305 :       MVT VT = Out.VT.getVectorElementType();
    1928         305 :       ISD::OutputArg NewOut = Out;
    1929             :       NewOut.Flags.setSplit();
    1930         305 :       NewOut.VT = VT;
    1931             : 
    1932             :       // We want the original number of vector elements here, e.g.
    1933             :       // three or five, not four or eight.
    1934         305 :       unsigned NumElements = Out.ArgVT.getVectorNumElements();
    1935             : 
    1936        2845 :       for (unsigned j = 0; j != NumElements; ++j) {
    1937             :         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
    1938        2540 :                                    DAG.getConstant(j, DL, MVT::i32));
    1939        1270 :         SplitVals.push_back(Elem);
    1940        1270 :         Splits.push_back(NewOut);
    1941        1270 :         NewOut.PartOffset += NewOut.VT.getStoreSize();
    1942             :       }
    1943             :     } else {
    1944        1503 :       SplitVals.push_back(OutVals[i]);
    1945        1503 :       Splits.push_back(Out);
    1946             :     }
    1947             :   }
    1948             : 
    1949             :   // CCValAssign - represents the assignment of the return value to a location.
    1950             :   SmallVector<CCValAssign, 48> RVLocs;
    1951             : 
    1952             :   // CCState - Info about the registers and stack slots.
    1953             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
    1954        4872 :                  *DAG.getContext());
    1955             : 
    1956             :   // Analyze outgoing return values.
    1957        2436 :   CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
    1958             : 
    1959        2436 :   SDValue Flag;
    1960             :   SmallVector<SDValue, 48> RetOps;
    1961        2436 :   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
    1962             : 
    1963             :   // Add return address for callable functions.
    1964        2436 :   if (!Info->isEntryFunction()) {
    1965        1361 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1966             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    1967        2722 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    1968             : 
    1969             :     // FIXME: Should be able to use a vreg here, but need a way to prevent it
    1970             :     // from being allocated to a CSR.
    1971             : 
    1972             :     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    1973        1361 :                                                 MVT::i64);
    1974             : 
    1975        1361 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
    1976        1361 :     Flag = Chain.getValue(1);
    1977             : 
    1978        1361 :     RetOps.push_back(PhysReturnAddrReg);
    1979             :   }
    1980             : 
    1981             :   // Copy the result values into the output registers.
    1982        2773 :   for (unsigned i = 0, realRVLocIdx = 0;
    1983       10418 :        i != RVLocs.size();
    1984             :        ++i, ++realRVLocIdx) {
    1985             :     CCValAssign &VA = RVLocs[i];
    1986             :     assert(VA.isRegLoc() && "Can only return in registers!");
    1987             :     // TODO: Partially return in registers if return values don't fit.
    1988             : 
    1989        2773 :     SDValue Arg = SplitVals[realRVLocIdx];
    1990             : 
    1991             :     // Copied from other backends.
    1992        2773 :     switch (VA.getLocInfo()) {
    1993             :     case CCValAssign::Full:
    1994             :       break;
    1995             :     case CCValAssign::BCvt:
    1996           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    1997           0 :       break;
    1998             :     case CCValAssign::SExt:
    1999           0 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2000           0 :       break;
    2001             :     case CCValAssign::ZExt:
    2002           0 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2003           0 :       break;
    2004             :     case CCValAssign::AExt:
    2005           3 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2006           3 :       break;
    2007           0 :     default:
    2008           0 :       llvm_unreachable("Unknown loc info!");
    2009             :     }
    2010             : 
    2011        2773 :     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    2012        2773 :     Flag = Chain.getValue(1);
    2013        2773 :     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    2014             :   }
    2015             : 
    2016             :   // FIXME: Does sret work properly?
    2017        2436 :   if (!Info->isEntryFunction()) {
    2018             :     const SIRegisterInfo *TRI
    2019        1361 :       = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
    2020             :     const MCPhysReg *I =
    2021        1361 :       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    2022        1361 :     if (I) {
    2023           0 :       for (; *I; ++I) {
    2024           0 :         if (AMDGPU::SReg_64RegClass.contains(*I))
    2025           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
    2026           0 :         else if (AMDGPU::SReg_32RegClass.contains(*I))
    2027           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
    2028             :         else
    2029           0 :           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    2030             :       }
    2031             :     }
    2032             :   }
    2033             : 
    2034             :   // Update chain and glue.
    2035        2436 :   RetOps[0] = Chain;
    2036        2436 :   if (Flag.getNode())
    2037        1965 :     RetOps.push_back(Flag);
    2038             : 
    2039             :   unsigned Opc = AMDGPUISD::ENDPGM;
    2040        2436 :   if (!IsWaveEnd)
    2041        1965 :     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
    2042        2436 :   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    2043             : }
    2044             : 
    2045         453 : SDValue SITargetLowering::LowerCallResult(
    2046             :     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    2047             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    2048             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    2049             :     SDValue ThisVal) const {
    2050         453 :   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
    2051             : 
    2052             :   // Assign locations to each value returned by this call.
    2053             :   SmallVector<CCValAssign, 16> RVLocs;
    2054             :   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
    2055         906 :                  *DAG.getContext());
    2056         453 :   CCInfo.AnalyzeCallResult(Ins, RetCC);
    2057             : 
    2058             :   // Copy all of the result registers out of their specified physreg.
    2059        1287 :   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    2060         127 :     CCValAssign VA = RVLocs[i];
    2061         127 :     SDValue Val;
    2062             : 
    2063         127 :     if (VA.isRegLoc()) {
    2064         127 :       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    2065         127 :       Chain = Val.getValue(1);
    2066         127 :       InFlag = Val.getValue(2);
    2067             :     } else if (VA.isMemLoc()) {
    2068           0 :       report_fatal_error("TODO: return values in memory");
    2069             :     } else
    2070             :       llvm_unreachable("unknown argument location type");
    2071             : 
    2072         127 :     switch (VA.getLocInfo()) {
    2073             :     case CCValAssign::Full:
    2074             :       break;
    2075             :     case CCValAssign::BCvt:
    2076           0 :       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
    2077           0 :       break;
    2078             :     case CCValAssign::ZExt:
    2079           7 :       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
    2080          14 :                         DAG.getValueType(VA.getValVT()));
    2081           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2082           7 :       break;
    2083             :     case CCValAssign::SExt:
    2084           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
    2085          14 :                         DAG.getValueType(VA.getValVT()));
    2086           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2087           7 :       break;
    2088             :     case CCValAssign::AExt:
    2089           3 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2090           3 :       break;
    2091           0 :     default:
    2092           0 :       llvm_unreachable("Unknown loc info!");
    2093             :     }
    2094             : 
    2095         127 :     InVals.push_back(Val);
    2096             :   }
    2097             : 
    2098         906 :   return Chain;
    2099             : }
    2100             : 
    2101             : // Add code to pass the special inputs required by the callee's used features,
    2102             : // separate from the explicit user arguments present in the IR.
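                     : //
                     : // For example, if the callee needs the workitem IDs or the queue pointer,
                     : // the caller's copies of those values are forwarded here, in registers when
                     : // the callee expects them in registers and otherwise via stores to the
                     : // callee's stack argument area.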
    2103         487 : void SITargetLowering::passSpecialInputs(
    2104             :     CallLoweringInfo &CLI,
    2105             :     const SIMachineFunctionInfo &Info,
    2106             :     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    2107             :     SmallVectorImpl<SDValue> &MemOpChains,
    2108             :     SDValue Chain,
    2109             :     SDValue StackPtr) const {
    2110             :   // If we don't have a call site, this was a call inserted by
    2111             :   // legalization. These can never use special inputs.
    2112         487 :   if (!CLI.CS)
    2113           0 :     return;
    2114             : 
    2115             :   const Function *CalleeFunc = CLI.CS.getCalledFunction();
    2116             :   assert(CalleeFunc);
    2117             : 
    2118         487 :   SelectionDAG &DAG = CLI.DAG;
    2119         487 :   const SDLoc &DL = CLI.DL;
    2120             : 
    2121         487 :   const SISubtarget *ST = getSubtarget();
    2122             :   const SIRegisterInfo *TRI = ST->getRegisterInfo();
    2123             : 
    2124             :   auto &ArgUsageInfo =
    2125         487 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    2126             :   const AMDGPUFunctionArgInfo &CalleeArgInfo
    2127             :     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    2128             : 
    2129             :   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
    2130             : 
    2131             :   // TODO: Unify with private memory register handling. This is complicated by
    2132             :   // the fact that at least in kernels, the input argument is not necessarily
    2133             :   // in the same location as the input.
    2134         487 :   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    2135             :     AMDGPUFunctionArgInfo::DISPATCH_PTR,
    2136             :     AMDGPUFunctionArgInfo::QUEUE_PTR,
    2137             :     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    2138             :     AMDGPUFunctionArgInfo::DISPATCH_ID,
    2139             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    2140             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    2141             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    2142             :     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
    2143             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
    2144             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
    2145             :     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
    2146             :   };
    2147             : 
    2148       11201 :   for (auto InputID : InputRegs) {
    2149             :     const ArgDescriptor *OutgoingArg;
    2150             :     const TargetRegisterClass *ArgRC;
    2151             : 
    2152       10714 :     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    2153        5357 :     if (!OutgoingArg)
    2154        5236 :       continue;
    2155             : 
    2156             :     const ArgDescriptor *IncomingArg;
    2157             :     const TargetRegisterClass *IncomingArgRC;
    2158             :     std::tie(IncomingArg, IncomingArgRC)
    2159         242 :       = CallerArgInfo.getPreloadedValue(InputID);
    2160             :     assert(IncomingArgRC == ArgRC);
    2161             : 
    2162             :     // All special arguments are ints for now.
    2163         121 :     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    2164         121 :     SDValue InputReg;
    2165             : 
    2166         121 :     if (IncomingArg) {
    2167         111 :       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    2168             :     } else {
    2169             :       // The implicit arg ptr is special because it doesn't have a corresponding
    2170             :       // input for kernels, and is computed from the kernarg segment pointer.
    2171             :       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    2172          10 :       InputReg = getImplicitArgPtr(DAG, DL);
    2173             :     }
    2174             : 
    2175         242 :     if (OutgoingArg->isRegister()) {
    2176         111 :       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    2177             :     } else {
    2178             :       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
    2179             :                                               InputReg,
    2180          10 :                                               OutgoingArg->getStackOffset());
    2181          10 :       MemOpChains.push_back(ArgStore);
    2182             :     }
    2183             :   }
    2184             : }
    2185             : 
    2186             : static bool canGuaranteeTCO(CallingConv::ID CC) {
    2187          39 :   return CC == CallingConv::Fast;
    2188             : }
    2189             : 
    2190             : /// Return true if we might ever do TCO for calls with this calling convention.
    2191             : static bool mayTailCallThisCC(CallingConv::ID CC) {
    2192          43 :   switch (CC) {
    2193             :   case CallingConv::C:
    2194             :     return true;
    2195             :   default:
    2196             :     return canGuaranteeTCO(CC);
    2197             :   }
    2198             : }
    2199             : 
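                     : // Check whether a call can actually be emitted as a tail call: the calling
                     : // convention must allow it, the caller must not be an entry function, the
                     : // result and argument assignments must be compatible between caller and
                     : // callee, and the callee's stack arguments must fit in the caller's own
                     : // incoming argument area.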
    2200          43 : bool SITargetLowering::isEligibleForTailCallOptimization(
    2201             :     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    2202             :     const SmallVectorImpl<ISD::OutputArg> &Outs,
    2203             :     const SmallVectorImpl<SDValue> &OutVals,
    2204             :     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
    2205          39 :   if (!mayTailCallThisCC(CalleeCC))
    2206             :     return false;
    2207             : 
    2208          43 :   MachineFunction &MF = DAG.getMachineFunction();
    2209          43 :   const Function &CallerF = MF.getFunction();
    2210             :   CallingConv::ID CallerCC = CallerF.getCallingConv();
    2211          43 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2212          43 :   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
    2213             : 
    2214             :   // Kernels aren't callable, and don't have a live-in return address, so it
    2215             :   // doesn't make sense to do a tail call with entry functions.
    2216          43 :   if (!CallerPreserved)
    2217             :     return false;
    2218             : 
    2219             :   bool CCMatch = CallerCC == CalleeCC;
    2220             : 
    2221          40 :   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    2222           0 :     if (canGuaranteeTCO(CalleeCC) && CCMatch)
    2223             :       return true;
    2224             :     return false;
    2225             :   }
    2226             : 
    2227             :   // TODO: Can we handle var args?
    2228          40 :   if (IsVarArg)
    2229             :     return false;
    2230             : 
    2231         136 :   for (const Argument &Arg : CallerF.args()) {
    2232          99 :     if (Arg.hasByValAttr())
    2233             :       return false;
    2234             :   }
    2235             : 
    2236          37 :   LLVMContext &Ctx = *DAG.getContext();
    2237             : 
    2238             :   // Check that the call results are passed in the same way.
    2239          37 :   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
    2240             :                                   CCAssignFnForCall(CalleeCC, IsVarArg),
    2241             :                                   CCAssignFnForCall(CallerCC, IsVarArg)))
    2242             :     return false;
    2243             : 
    2244             :   // The callee has to preserve all registers the caller needs to preserve.
    2245          37 :   if (!CCMatch) {
    2246           0 :     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    2247           0 :     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    2248             :       return false;
    2249             :   }
    2250             : 
    2251             :   // Nothing more to check if the callee is taking no arguments.
    2252          37 :   if (Outs.empty())
    2253             :     return true;
    2254             : 
    2255             :   SmallVector<CCValAssign, 16> ArgLocs;
    2256          66 :   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
    2257             : 
    2258          33 :   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
    2259             : 
    2260          33 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    2261             :   // If the stack arguments for this call do not fit into our own save area then
    2262             :   // the call cannot be made tail.
    2263             :   // TODO: Is this really necessary?
    2264          33 :   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    2265             :     return false;
    2266             : 
    2267          30 :   const MachineRegisterInfo &MRI = MF.getRegInfo();
    2268          30 :   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
    2269             : }
    2270             : 
    2271          16 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
    2272          16 :   if (!CI->isTailCall())
    2273             :     return false;
    2274             : 
    2275           4 :   const Function *ParentFn = CI->getParent()->getParent();
    2276           4 :   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    2277             :     return false;
    2278             : 
    2279           1 :   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
    2280           1 :   return (Attr.getValueAsString() != "true");
    2281             : }
    2282             : 
    2283             : // The wave scratch offset register is used as the global base pointer.
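                     : //
                     : // Roughly, the steps below are: reject unsupported calls (varargs, indirect,
                     : // required tail calls), decide whether this can be a tail/sibling call,
                     : // assign argument locations, copy arguments into registers or stack slots,
                     : // forward the special input registers, then emit either a TC_RETURN or a
                     : // CALL node and copy back the results in LowerCallResult.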
    2284         493 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
    2285             :                                     SmallVectorImpl<SDValue> &InVals) const {
    2286         493 :   SelectionDAG &DAG = CLI.DAG;
    2287         493 :   const SDLoc &DL = CLI.DL;
    2288             :   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    2289             :   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
    2290             :   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
    2291         493 :   SDValue Chain = CLI.Chain;
    2292         493 :   SDValue Callee = CLI.Callee;
    2293             :   bool &IsTailCall = CLI.IsTailCall;
    2294         493 :   CallingConv::ID CallConv = CLI.CallConv;
    2295         493 :   bool IsVarArg = CLI.IsVarArg;
    2296             :   bool IsSibCall = false;
    2297             :   bool IsThisReturn = false;
    2298         493 :   MachineFunction &MF = DAG.getMachineFunction();
    2299             : 
    2300         493 :   if (IsVarArg) {
    2301             :     return lowerUnhandledCall(CLI, InVals,
    2302           2 :                               "unsupported call to variadic function ");
    2303             :   }
    2304             : 
    2305             :   if (!CLI.CS.getCalledFunction()) {
    2306             :     return lowerUnhandledCall(CLI, InVals,
    2307           8 :                               "unsupported indirect call to function ");
    2308             :   }
    2309             : 
    2310         488 :   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    2311             :     return lowerUnhandledCall(CLI, InVals,
    2312           2 :                               "unsupported required tail call to function ");
    2313             :   }
    2314             : 
    2315             :   // The first 4 bytes are reserved for the callee's emergency stack slot.
    2316             :   const unsigned CalleeUsableStackOffset = 4;
    2317             : 
    2318         487 :   if (IsTailCall) {
    2319          43 :     IsTailCall = isEligibleForTailCallOptimization(
    2320             :       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    2321          52 :     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
    2322           0 :       report_fatal_error("failed to perform tail call elimination on a call "
    2323             :                          "site marked musttail");
    2324             :     }
    2325             : 
    2326          43 :     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
    2327             : 
    2328             :     // A sibling call is one where we're under the usual C ABI and not planning
    2329             :     // to change that but can still do a tail call:
    2330          86 :     if (!TailCallOpt && IsTailCall)
    2331             :       IsSibCall = true;
    2332             : 
    2333             :     if (IsTailCall)
    2334             :       ++NumTailCalls;
    2335             :   }
    2336             : 
    2337             :   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    2338             :     // FIXME: Remove this hack for function pointer types after removing
    2339             :     // support for the old address space mapping. In the new address space
    2340             :     // mapping, the pointer in the default address space is 64 bits, so this
    2341             :     // hack is not needed.
    2342         487 :     if (Callee.getValueType() == MVT::i32) {
    2343           0 :       const GlobalValue *GV = GA->getGlobal();
    2344           0 :       Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
    2345           0 :                                     GA->getTargetFlags());
    2346             :     }
    2347             :   }
    2348             :   assert(Callee.getValueType() == MVT::i64);
    2349             : 
    2350         487 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    2351             : 
    2352             :   // Analyze operands of the call, assigning locations to each operand.
    2353             :   SmallVector<CCValAssign, 16> ArgLocs;
    2354         974 :   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    2355         487 :   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
    2356         487 :   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
    2357             : 
    2358             :   // Get a count of how many bytes are to be pushed on the stack.
    2359         487 :   unsigned NumBytes = CCInfo.getNextStackOffset();
    2360             : 
    2361         487 :   if (IsSibCall) {
    2362             :     // Since we're not changing the ABI to make this a tail call, the memory
    2363             :     // operands are already available in the caller's incoming argument space.
    2364             :     NumBytes = 0;
    2365             :   }
    2366             : 
    2367             :   // FPDiff is the byte offset of the call's argument area from the callee's.
    2368             :   // Stores to callee stack arguments will be placed in FixedStackSlots offset
    2369             :   // by this amount for a tail call. In a sibling call it must be 0 because the
    2370             :   // caller will deallocate the entire stack and the callee still expects its
    2371             :   // arguments to begin at SP+0. Completely unused for non-tail calls.
    2372             :   int32_t FPDiff = 0;
    2373         487 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    2374             :   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    2375             : 
    2376         487 :   SDValue CallerSavedFP;
    2377             : 
    2378             :   // Adjust the stack pointer for the new arguments...
    2379             :   // These operations are automatically eliminated by the prolog/epilog pass
    2380         487 :   if (!IsSibCall) {
    2381         453 :     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    2382             : 
    2383         453 :     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
    2384             : 
    2385             :     // In the HSA case, this should be an identity copy.
    2386             :     SDValue ScratchRSrcReg
    2387         453 :       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    2388         453 :     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    2389             : 
    2390             :     // TODO: Don't hardcode these registers; get them from the callee function.
    2391             :     SDValue ScratchWaveOffsetReg
    2392         453 :       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
    2393         453 :     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
    2394             : 
    2395         453 :     if (!Info->isEntryFunction()) {
    2396             :       // Avoid clobbering this function's FP value. In the current convention
    2397             :       // the callee will overwrite it, so save/restore it around the call site.
    2398          98 :       CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
    2399         196 :                                          Info->getFrameOffsetReg(), MVT::i32);
    2400             :     }
    2401             :   }
    2402             : 
    2403             :   // Stack pointer relative accesses are done by changing the offset SGPR. This
    2404             :   // is just the VGPR offset component.
    2405         487 :   SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
    2406             : 
    2407             :   SmallVector<SDValue, 8> MemOpChains;
    2408             :   MVT PtrVT = MVT::i32;
    2409             : 
    2410             :   // Walk the register/memloc assignments, inserting copies/loads.
    2411        1540 :   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
    2412             :        ++i, ++realArgIdx) {
    2413        1053 :     CCValAssign &VA = ArgLocs[i];
    2414        1053 :     SDValue Arg = OutVals[realArgIdx];
    2415             : 
    2416             :     // Promote the value if needed.
    2417        1053 :     switch (VA.getLocInfo()) {
    2418             :     case CCValAssign::Full:
    2419             :       break;
    2420             :     case CCValAssign::BCvt:
    2421           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2422           0 :       break;
    2423             :     case CCValAssign::ZExt:
    2424          10 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2425          10 :       break;
    2426             :     case CCValAssign::SExt:
    2427          10 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2428          10 :       break;
    2429             :     case CCValAssign::AExt:
    2430           4 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2431           4 :       break;
    2432             :     case CCValAssign::FPExt:
    2433           0 :       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
    2434           0 :       break;
    2435           0 :     default:
    2436           0 :       llvm_unreachable("Unknown loc info!");
    2437             :     }
    2438             : 
    2439        1053 :     if (VA.isRegLoc()) {
    2440        1982 :       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    2441             :     } else {
    2442             :       assert(VA.isMemLoc());
    2443             : 
    2444          62 :       SDValue DstAddr;
    2445             :       MachinePointerInfo DstInfo;
    2446             : 
    2447          62 :       unsigned LocMemOffset = VA.getLocMemOffset();
    2448          62 :       int32_t Offset = LocMemOffset;
    2449             : 
    2450          62 :       SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
    2451             : 
    2452          62 :       if (IsTailCall) {
    2453          27 :         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    2454          27 :         unsigned OpSize = Flags.isByVal() ?
    2455          51 :           Flags.getByValSize() : VA.getValVT().getStoreSize();
    2456             : 
    2457             :         Offset = Offset + FPDiff;
    2458          27 :         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
    2459             : 
    2460          27 :         DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
    2461          54 :                                          StackPtr);
    2462          27 :         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
    2463             : 
    2464             :         // Make sure any stack arguments overlapping with where we're storing
    2465             :         // are loaded before this eventual operation. Otherwise they'll be
    2466             :         // clobbered.
    2467             : 
    2468             :         // FIXME: Why is this really necessary? This seems to just result in a
    2469             :         // lot of code to copy the stack and write them back to the same
    2470             :         // locations, which are supposed to be immutable?
    2471          27 :         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
    2472             :       } else {
    2473          35 :         DstAddr = PtrOff;
    2474          35 :         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
    2475             :       }
    2476             : 
    2477          62 :       if (Outs[i].Flags.isByVal()) {
    2478             :         SDValue SizeNode =
    2479          28 :             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
    2480             :         SDValue Cpy = DAG.getMemcpy(
    2481             :             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
    2482             :             /*isVol = */ false, /*AlwaysInline = */ true,
    2483             :             /*isTailCall = */ false, DstInfo,
    2484          28 :             MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
    2485          56 :                 *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
    2486             : 
    2487          28 :         MemOpChains.push_back(Cpy);
    2488             :       } else {
    2489          34 :         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
    2490          34 :         MemOpChains.push_back(Store);
    2491             :       }
    2492             :     }
    2493             :   }
    2494             : 
    2495             :   // Copy special input registers after user input arguments.
    2496         487 :   passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
    2497             : 
    2498         487 :   if (!MemOpChains.empty())
    2499          46 :     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
    2500             : 
    2501             :   // Build a sequence of copy-to-reg nodes chained together with token chain
    2502             :   // and flag operands which copy the outgoing args into the appropriate regs.
    2503         487 :   SDValue InFlag;
    2504        4503 :   for (auto &RegToPass : RegsToPass) {
    2505        2008 :     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
    2506        2008 :                              RegToPass.second, InFlag);
    2507        2008 :     InFlag = Chain.getValue(1);
    2508             :   }
    2509             : 
    2510             : 
    2511         487 :   SDValue PhysReturnAddrReg;
    2512         487 :   if (IsTailCall) {
    2513             :     // Since the return is being combined with the call, we need to pass on the
    2514             :     // return address.
    2515             : 
    2516          34 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2517             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2518          68 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2519             : 
    2520          34 :     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2521          68 :                                         MVT::i64);
    2522          34 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    2523          34 :     InFlag = Chain.getValue(1);
    2524             :   }
    2525             : 
    2526             :   // We don't usually want to end the call-sequence here because we would tidy
    2527             :   // the frame up *after* the call. However, in the ABI-changing tail-call case
    2528             :   // we've carefully laid out the parameters so that when sp is reset they'll be
    2529             :   // in the correct location.
    2530         487 :   if (IsTailCall && !IsSibCall) {
    2531           0 :     Chain = DAG.getCALLSEQ_END(Chain,
    2532             :                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
    2533             :                                DAG.getTargetConstant(0, DL, MVT::i32),
    2534           0 :                                InFlag, DL);
    2535           0 :     InFlag = Chain.getValue(1);
    2536             :   }
    2537             : 
    2538             :   std::vector<SDValue> Ops;
    2539         487 :   Ops.push_back(Chain);
    2540         487 :   Ops.push_back(Callee);
    2541             : 
    2542         487 :   if (IsTailCall) {
    2543             :     // Each tail call may have to adjust the stack by a different amount, so
    2544             :     // this information must travel along with the operation for eventual
    2545             :     // consumption by emitEpilogue.
    2546          68 :     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
    2547             : 
    2548          34 :     Ops.push_back(PhysReturnAddrReg);
    2549             :   }
    2550             : 
    2551             :   // Add argument registers to the end of the list so that they are known live
    2552             :   // into the call.
    2553        4503 :   for (auto &RegToPass : RegsToPass) {
    2554        4016 :     Ops.push_back(DAG.getRegister(RegToPass.first,
    2555        4016 :                                   RegToPass.second.getValueType()));
    2556             :   }
    2557             : 
    2558             :   // Add a register mask operand representing the call-preserved registers.
    2559             : 
    2560         487 :   const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
    2561         487 :   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
    2562             :   assert(Mask && "Missing call preserved mask for calling convention");
    2563         974 :   Ops.push_back(DAG.getRegisterMask(Mask));
    2564             : 
    2565         487 :   if (InFlag.getNode())
    2566         487 :     Ops.push_back(InFlag);
    2567             : 
    2568         487 :   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    2569             : 
    2570             :   // If we're doing a tail call, use a TC_RETURN here rather than an
    2571             :   // actual call instruction.
    2572         487 :   if (IsTailCall) {
    2573             :     MFI.setHasTailCall();
    2574          34 :     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
    2575             :   }
    2576             : 
    2577             :   // Returns a chain and a flag for retval copy to use.
    2578         453 :   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
    2579         453 :   Chain = Call.getValue(0);
    2580         453 :   InFlag = Call.getValue(1);
    2581             : 
    2582         453 :   if (CallerSavedFP) {
    2583          98 :     SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
    2584          98 :     Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
    2585          98 :     InFlag = Chain.getValue(1);
    2586             :   }
    2587             : 
    2588         453 :   uint64_t CalleePopBytes = NumBytes;
    2589         453 :   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
    2590             :                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
    2591             :                              InFlag, DL);
    2592         453 :   if (!Ins.empty())
    2593         113 :     InFlag = Chain.getValue(1);
    2594             : 
    2595             :   // Handle result values, copying them out of physregs into vregs that we
    2596             :   // return.
    2597             :   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
    2598             :                          InVals, IsThisReturn,
    2599         453 :                          IsThisReturn ? OutVals[0] : SDValue());
    2600             : }
    2601             : 
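                     : // Resolve a named physical register (used, e.g., by the llvm.read_register
                     : // and llvm.write_register intrinsics). Only a few special registers are
                     : // supported, and the requested type must match the register's width.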
    2602          27 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
    2603             :                                              SelectionDAG &DAG) const {
    2604          27 :   unsigned Reg = StringSwitch<unsigned>(RegName)
    2605             :     .Case("m0", AMDGPU::M0)
    2606             :     .Case("exec", AMDGPU::EXEC)
    2607             :     .Case("exec_lo", AMDGPU::EXEC_LO)
    2608             :     .Case("exec_hi", AMDGPU::EXEC_HI)
    2609             :     .Case("flat_scratch", AMDGPU::FLAT_SCR)
    2610             :     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    2611             :     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    2612             :     .Default(AMDGPU::NoRegister);
    2613             : 
    2614          27 :   if (Reg == AMDGPU::NoRegister) {
    2615           0 :     report_fatal_error(Twine("invalid register name \""
    2616             :                              + StringRef(RegName)  + "\"."));
    2617             : 
    2618             :   }
    2619             : 
    2620          30 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    2621           3 :       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    2622           1 :     report_fatal_error(Twine("invalid register \""
    2623             :                              + StringRef(RegName)  + "\" for subtarget."));
    2624             :   }
    2625             : 
    2626          26 :   switch (Reg) {
    2627          17 :   case AMDGPU::M0:
    2628             :   case AMDGPU::EXEC_LO:
    2629             :   case AMDGPU::EXEC_HI:
    2630             :   case AMDGPU::FLAT_SCR_LO:
    2631             :   case AMDGPU::FLAT_SCR_HI:
    2632          17 :     if (VT.getSizeInBits() == 32)
    2633             :       return Reg;
    2634             :     break;
    2635           9 :   case AMDGPU::EXEC:
    2636             :   case AMDGPU::FLAT_SCR:
    2637           9 :     if (VT.getSizeInBits() == 64)
    2638             :       return Reg;
    2639             :     break;
    2640           0 :   default:
    2641           0 :     llvm_unreachable("missing register type checking");
    2642             :   }
    2643             : 
    2644           2 :   report_fatal_error(Twine("invalid type for register \""
    2645             :                            + StringRef(RegName) + "\"."));
    2646             : }
    2647             : 
    2648             : // If kill is not the last instruction, split the block so kill is always a
    2649             : // proper terminator.
    2650          82 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
    2651             :                                                     MachineBasicBlock *BB) const {
    2652          82 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    2653             : 
    2654             :   MachineBasicBlock::iterator SplitPoint(&MI);
    2655             :   ++SplitPoint;
    2656             : 
    2657          82 :   if (SplitPoint == BB->end()) {
    2658             :     // Don't bother with a new block.
    2659           8 :     MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2660           4 :     return BB;
    2661             :   }
    2662             : 
    2663          78 :   MachineFunction *MF = BB->getParent();
    2664             :   MachineBasicBlock *SplitBB
    2665          78 :     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
    2666             : 
    2667             :   MF->insert(++MachineFunction::iterator(BB), SplitBB);
    2668             :   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
    2669             : 
    2670          78 :   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
    2671          78 :   BB->addSuccessor(SplitBB);
    2672             : 
    2673         156 :   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2674          78 :   return SplitBB;
    2675             : }
    2676             : 
    2677             : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
    2678             : // wavefront. If the value is uniform and just happens to be in a VGPR, this
    2679             : // will only do one iteration. In the worst case, this will loop 64 times.
    2680             : //
    2681             : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
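                     : //
                     : // Roughly, each loop iteration below: v_readfirstlane picks one lane's index,
                     : // v_cmp + s_and_saveexec restrict EXEC to the lanes sharing that index, the
                     : // indexed access is performed for those lanes, and s_xor_b64 with
                     : // s_cbranch_execnz repeats until every lane has been handled.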
    2682          32 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
    2683             :   const SIInstrInfo *TII,
    2684             :   MachineRegisterInfo &MRI,
    2685             :   MachineBasicBlock &OrigBB,
    2686             :   MachineBasicBlock &LoopBB,
    2687             :   const DebugLoc &DL,
    2688             :   const MachineOperand &IdxReg,
    2689             :   unsigned InitReg,
    2690             :   unsigned ResultReg,
    2691             :   unsigned PhiReg,
    2692             :   unsigned InitSaveExecReg,
    2693             :   int Offset,
    2694             :   bool UseGPRIdxMode,
    2695             :   bool IsIndirectSrc) {
    2696          32 :   MachineBasicBlock::iterator I = LoopBB.begin();
    2697             : 
    2698          32 :   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2699          32 :   unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2700          32 :   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2701          32 :   unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2702             : 
    2703          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    2704          32 :     .addReg(InitReg)
    2705             :     .addMBB(&OrigBB)
    2706          32 :     .addReg(ResultReg)
    2707             :     .addMBB(&LoopBB);
    2708             : 
    2709          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    2710          32 :     .addReg(InitSaveExecReg)
    2711             :     .addMBB(&OrigBB)
    2712          32 :     .addReg(NewExec)
    2713             :     .addMBB(&LoopBB);
    2714             : 
    2715             :   // Read the next variant <- also loop target.
    2716          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    2717          32 :     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    2718             : 
    2719             :   // Compare the just-read M0 value to all possible Idx values.
    2720          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    2721          32 :     .addReg(CurrentIdxReg)
    2722          32 :     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
    2723             : 
    2724             :   // Update EXEC, save the original EXEC value to VCC.
    2725          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    2726          32 :     .addReg(CondReg, RegState::Kill);
    2727             : 
    2728          32 :   MRI.setSimpleHint(NewExec, CondReg);
    2729             : 
    2730          32 :   if (UseGPRIdxMode) {
    2731             :     unsigned IdxReg;
    2732          16 :     if (Offset == 0) {
    2733             :       IdxReg = CurrentIdxReg;
    2734             :     } else {
    2735           6 :       IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2736          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
    2737           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2738           6 :         .addImm(Offset);
    2739             :     }
    2740          16 :     unsigned IdxMode = IsIndirectSrc ?
    2741             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2742             :     MachineInstr *SetOn =
    2743          48 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2744          16 :       .addReg(IdxReg, RegState::Kill)
    2745          32 :       .addImm(IdxMode);
    2746          16 :     SetOn->getOperand(3).setIsUndef();
    2747             :   } else {
    2748             :     // Move the index value into M0.
    2749          16 :     if (Offset == 0) {
    2750          30 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2751          10 :         .addReg(CurrentIdxReg, RegState::Kill);
    2752             :     } else {
    2753          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2754           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2755           6 :         .addImm(Offset);
    2756             :     }
    2757             :   }
    2758             : 
    2759             :   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
    2760             :   MachineInstr *InsertPt =
    2761          96 :     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    2762          32 :     .addReg(AMDGPU::EXEC)
    2763          32 :     .addReg(NewExec);
    2764             : 
    2765             :   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
    2766             :   // s_cbranch_scc0?
    2767             : 
    2768             :   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
    2769          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    2770             :     .addMBB(&LoopBB);
    2771             : 
    2772          32 :   return InsertPt->getIterator();
    2773             : }
    2774             : 
    2775             : // This has slightly sub-optimal regalloc when the source vector is killed by
    2776             : // the read. The register allocator does not understand that the kill is
    2777             : // per-workitem, so the vector is kept alive for the whole loop and we end up
    2778             : // not re-using a subregister from it, using 1 more VGPR than necessary. This
    2779             : // was avoided when this was expanded after register allocation.
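                     : //
                     : // Save EXEC, split the block, emit the waterfall loop via
                     : // emitLoadM0FromVGPRLoop, and restore EXEC in the remainder block. Returns
                     : // the point inside the loop where the indexed operation should be inserted.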
    2780          32 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
    2781             :                                                   MachineBasicBlock &MBB,
    2782             :                                                   MachineInstr &MI,
    2783             :                                                   unsigned InitResultReg,
    2784             :                                                   unsigned PhiReg,
    2785             :                                                   int Offset,
    2786             :                                                   bool UseGPRIdxMode,
    2787             :                                                   bool IsIndirectSrc) {
    2788          32 :   MachineFunction *MF = MBB.getParent();
    2789          32 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2790             :   const DebugLoc &DL = MI.getDebugLoc();
    2791             :   MachineBasicBlock::iterator I(&MI);
    2792             : 
    2793          32 :   unsigned DstReg = MI.getOperand(0).getReg();
    2794          32 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2795          32 :   unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2796             : 
    2797          64 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
    2798             : 
    2799             :   // Save the EXEC mask
    2800          96 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    2801          32 :     .addReg(AMDGPU::EXEC);
    2802             : 
    2803             :   // To insert the loop we need to split the block. Move everything after this
    2804             :   // point to a new block, and insert a new empty block between the two.
    2805          32 :   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    2806          32 :   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    2807             :   MachineFunction::iterator MBBI(MBB);
    2808             :   ++MBBI;
    2809             : 
    2810             :   MF->insert(MBBI, LoopBB);
    2811             :   MF->insert(MBBI, RemainderBB);
    2812             : 
    2813          32 :   LoopBB->addSuccessor(LoopBB);
    2814          32 :   LoopBB->addSuccessor(RemainderBB);
    2815             : 
    2816             :   // Move the rest of the block into a new block.
    2817          32 :   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    2818             :   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    2819             : 
    2820          32 :   MBB.addSuccessor(LoopBB);
    2821             : 
    2822          32 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2823             : 
    2824             :   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
    2825             :                                       InitResultReg, DstReg, PhiReg, TmpExec,
    2826          32 :                                       Offset, UseGPRIdxMode, IsIndirectSrc);
    2827             : 
    2828          32 :   MachineBasicBlock::iterator First = RemainderBB->begin();
    2829          96 :   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    2830          32 :     .addReg(SaveExec);
    2831             : 
    2832          32 :   return InsPt;
    2833             : }
    2834             : 
    2835             : // Returns the subreg index to access and the remaining constant offset.
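                     : // For example, with a 128-bit (four 32-bit element) super-register class and
                     : // the usual consecutive sub0..subN indices, Offset 2 yields (AMDGPU::sub2, 0),
                     : // while an out-of-range Offset such as 7 is passed through as (AMDGPU::sub0, 7)
                     : // so that it is folded into the dynamic index instead.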
    2836             : static std::pair<unsigned, int>
    2837         161 : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
    2838             :                             const TargetRegisterClass *SuperRC,
    2839             :                             unsigned VecReg,
    2840             :                             int Offset) {
    2841         161 :   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    2842             : 
    2843             :   // Skip out of bounds offsets, or else we would end up using an undefined
    2844             :   // register.
    2845         161 :   if (Offset >= NumElts || Offset < 0)
    2846          40 :     return std::make_pair(AMDGPU::sub0, Offset);
    2847             : 
    2848         242 :   return std::make_pair(AMDGPU::sub0 + Offset, 0);
    2849             : }
    2850             : 
    2851             : // Return true if the index is an SGPR and was set.
    2852         161 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
    2853             :                                  MachineRegisterInfo &MRI,
    2854             :                                  MachineInstr &MI,
    2855             :                                  int Offset,
    2856             :                                  bool UseGPRIdxMode,
    2857             :                                  bool IsIndirectSrc) {
    2858         161 :   MachineBasicBlock *MBB = MI.getParent();
    2859             :   const DebugLoc &DL = MI.getDebugLoc();
    2860             :   MachineBasicBlock::iterator I(&MI);
    2861             : 
    2862         161 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2863         161 :   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
    2864             : 
    2865             :   assert(Idx->getReg() != AMDGPU::NoRegister);
    2866             : 
    2867         161 :   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    2868             :     return false;
    2869             : 
    2870         129 :   if (UseGPRIdxMode) {
    2871          31 :     unsigned IdxMode = IsIndirectSrc ?
    2872             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2873          31 :     if (Offset == 0) {
    2874             :       MachineInstr *SetOn =
    2875          34 :           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2876             :               .add(*Idx)
    2877          17 :               .addImm(IdxMode);
    2878             : 
    2879          17 :       SetOn->getOperand(3).setIsUndef();
    2880             :     } else {
    2881          14 :       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    2882          28 :       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
    2883             :           .add(*Idx)
    2884          14 :           .addImm(Offset);
    2885             :       MachineInstr *SetOn =
    2886          42 :         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2887          14 :         .addReg(Tmp, RegState::Kill)
    2888          28 :         .addImm(IdxMode);
    2889             : 
    2890          14 :       SetOn->getOperand(3).setIsUndef();
    2891             :     }
    2892             : 
    2893             :     return true;
    2894             :   }
    2895             : 
    2896          98 :   if (Offset == 0) {
    2897         252 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2898             :       .add(*Idx);
    2899             :   } else {
    2900          28 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2901             :       .add(*Idx)
    2902          14 :       .addImm(Offset);
    2903             :   }
    2904             : 
    2905             :   return true;
    2906             : }
    2907             : 
    2908             : // Control flow needs to be inserted if indexing with a VGPR.
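                     : // When the index is a uniform SGPR no loop is required; the sequences built
                     : // below are roughly either
                     : //   s_mov_b32 m0, idx
                     : //   v_movrels_b32_e32 dst, src       ; source register index is offset by m0
                     : // or, in VGPR index mode,
                     : //   s_set_gpr_idx_on idx, SRC0_ENABLE
                     : //   v_mov_b32_e32 dst, src
                     : //   s_set_gpr_idx_off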
    2909          71 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
    2910             :                                           MachineBasicBlock &MBB,
    2911             :                                           const SISubtarget &ST) {
    2912             :   const SIInstrInfo *TII = ST.getInstrInfo();
    2913             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    2914          71 :   MachineFunction *MF = MBB.getParent();
    2915          71 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2916             : 
    2917          71 :   unsigned Dst = MI.getOperand(0).getReg();
    2918          71 :   unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
    2919          71 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    2920             : 
    2921             :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
    2922             : 
    2923             :   unsigned SubReg;
    2924             :   std::tie(SubReg, Offset)
    2925         142 :     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
    2926             : 
    2927             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    2928             : 
    2929          71 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    2930             :     MachineBasicBlock::iterator I(&MI);
    2931             :     const DebugLoc &DL = MI.getDebugLoc();
    2932             : 
    2933          59 :     if (UseGPRIdxMode) {
    2934             :       // TODO: Look at the uses to avoid the copy. This may require rescheduling
    2935             :       // to avoid interfering with other uses, so probably requires a new
    2936             :       // optimization pass.
    2937          51 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2938          17 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2939          17 :         .addReg(SrcReg, RegState::Implicit)
    2940          17 :         .addReg(AMDGPU::M0, RegState::Implicit);
    2941          34 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2942             :     } else {
    2943         126 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2944          42 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2945          42 :         .addReg(SrcReg, RegState::Implicit);
    2946             :     }
    2947             : 
    2948          59 :     MI.eraseFromParent();
    2949             : 
    2950             :     return &MBB;
    2951             :   }
    2952             : 
    2953             :   const DebugLoc &DL = MI.getDebugLoc();
    2954             :   MachineBasicBlock::iterator I(&MI);
    2955             : 
    2956          12 :   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2957          12 :   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2958             : 
    2959          24 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
    2960             : 
    2961             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
    2962          12 :                               Offset, UseGPRIdxMode, true);
    2963          12 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    2964             : 
    2965          12 :   if (UseGPRIdxMode) {
    2966          18 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2967           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    2968           6 :       .addReg(SrcReg, RegState::Implicit)
    2969           6 :       .addReg(AMDGPU::M0, RegState::Implicit);
    2970          12 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2971             :   } else {
    2972          18 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2973           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    2974           6 :       .addReg(SrcReg, RegState::Implicit);
    2975             :   }
    2976             : 
    2977          12 :   MI.eraseFromParent();
    2978             : 
    2979          12 :   return LoopBB;
    2980             : }
    2981             : 
    2982          66 : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
    2983             :                                  const TargetRegisterClass *VecRC) {
    2984          66 :   switch (TRI.getRegSizeInBits(*VecRC)) {
    2985             :   case 32: // 4 bytes
    2986             :     return AMDGPU::V_MOVRELD_B32_V1;
    2987           6 :   case 64: // 8 bytes
    2988           6 :     return AMDGPU::V_MOVRELD_B32_V2;
    2989          38 :   case 128: // 16 bytes
    2990          38 :     return AMDGPU::V_MOVRELD_B32_V4;
    2991          16 :   case 256: // 32 bytes
    2992          16 :     return AMDGPU::V_MOVRELD_B32_V8;
    2993           6 :   case 512: // 64 bytes
    2994           6 :     return AMDGPU::V_MOVRELD_B32_V16;
    2995           0 :   default:
    2996           0 :     llvm_unreachable("unsupported size for MOVRELD pseudos");
    2997             :   }
    2998             : }
    2999             : 
    3000          90 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
    3001             :                                           MachineBasicBlock &MBB,
    3002             :                                           const SISubtarget &ST) {
    3003             :   const SIInstrInfo *TII = ST.getInstrInfo();
    3004             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    3005          90 :   MachineFunction *MF = MBB.getParent();
    3006          90 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    3007             : 
    3008          90 :   unsigned Dst = MI.getOperand(0).getReg();
    3009          90 :   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    3010          90 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    3011          90 :   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
    3012          90 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    3013          90 :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
    3014             : 
    3015             :   // This can be an immediate, but will be folded later.
    3016             :   assert(Val->getReg());
    3017             : 
    3018             :   unsigned SubReg;
    3019         180 :   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
    3020             :                                                          SrcVec->getReg(),
    3021             :                                                          Offset);
    3022             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    3023             : 
    3024          90 :   if (Idx->getReg() == AMDGPU::NoRegister) {
    3025             :     MachineBasicBlock::iterator I(&MI);
    3026             :     const DebugLoc &DL = MI.getDebugLoc();
    3027             : 
    3028             :     assert(Offset == 0);
    3029             : 
    3030           0 :     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
    3031             :         .add(*SrcVec)
    3032             :         .add(*Val)
    3033           0 :         .addImm(SubReg);
    3034             : 
    3035           0 :     MI.eraseFromParent();
    3036             :     return &MBB;
    3037             :   }
    3038             : 
    3039          90 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    3040             :     MachineBasicBlock::iterator I(&MI);
    3041             :     const DebugLoc &DL = MI.getDebugLoc();
    3042             : 
    3043          70 :     if (UseGPRIdxMode) {
    3044          42 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3045          14 :           .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
    3046             :           .add(*Val)
    3047          14 :           .addReg(Dst, RegState::ImplicitDefine)
    3048          14 :           .addReg(SrcVec->getReg(), RegState::Implicit)
    3049          14 :           .addReg(AMDGPU::M0, RegState::Implicit);
    3050             : 
    3051          28 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3052             :     } else {
    3053          56 :       const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3054             : 
    3055         112 :       BuildMI(MBB, I, DL, MovRelDesc)
    3056          56 :           .addReg(Dst, RegState::Define)
    3057          56 :           .addReg(SrcVec->getReg())
    3058             :           .add(*Val)
    3059          56 :           .addImm(SubReg - AMDGPU::sub0);
    3060             :     }
    3061             : 
    3062          70 :     MI.eraseFromParent();
    3063             :     return &MBB;
    3064             :   }
    3065             : 
    3066          20 :   if (Val->isReg())
    3067          20 :     MRI.clearKillFlags(Val->getReg());
    3068             : 
    3069             :   const DebugLoc &DL = MI.getDebugLoc();
    3070             : 
    3071          20 :   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
    3072             : 
    3073             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
    3074          20 :                               Offset, UseGPRIdxMode, false);
    3075          20 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    3076             : 
    3077          20 :   if (UseGPRIdxMode) {
    3078          30 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3079          10 :         .addReg(PhiReg, RegState::Undef, SubReg) // vdst
    3080             :         .add(*Val)                               // src0
    3081          10 :         .addReg(Dst, RegState::ImplicitDefine)
    3082          10 :         .addReg(PhiReg, RegState::Implicit)
    3083          10 :         .addReg(AMDGPU::M0, RegState::Implicit);
    3084          20 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3085             :   } else {
    3086          10 :     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3087             : 
    3088          20 :     BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
    3089          10 :         .addReg(Dst, RegState::Define)
    3090          10 :         .addReg(PhiReg)
    3091             :         .add(*Val)
    3092          10 :         .addImm(SubReg - AMDGPU::sub0);
    3093             :   }
    3094             : 
    3095          20 :   MI.eraseFromParent();
    3096             : 
    3097          20 :   return LoopBB;
    3098             : }
    3099             : 
    3100       13634 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    3101             :   MachineInstr &MI, MachineBasicBlock *BB) const {
    3102             : 
    3103       13634 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3104       13634 :   MachineFunction *MF = BB->getParent();
    3105       13634 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    3106             : 
    3107       13634 :   if (TII->isMIMG(MI)) {
    3108         625 :     if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
    3109           0 :       report_fatal_error("missing mem operand from MIMG instruction");
    3110             :     }
    3111             :     // Add a memoperand for mimg instructions so that they aren't assumed to
    3112             :     // be ordered memory instructions.
    3113             : 
    3114             :     return BB;
    3115             :   }
    3116             : 
    3117       13009 :   switch (MI.getOpcode()) {
    3118        2263 :   case AMDGPU::S_ADD_U64_PSEUDO:
    3119             :   case AMDGPU::S_SUB_U64_PSEUDO: {
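                     :     // The 64-bit scalar add/sub is expanded over the sub0/sub1 halves, roughly:
                     :     //   s_add_u32  dst.sub0, src0.sub0, src1.sub0
                     :     //   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
                     :     // (s_sub_u32 / s_subb_u32 for the subtract pseudo), with a REG_SEQUENCE
                     :     // recombining the two halves into the 64-bit result.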
    3120        2263 :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3121             :     const DebugLoc &DL = MI.getDebugLoc();
    3122             : 
    3123        2263 :     MachineOperand &Dest = MI.getOperand(0);
    3124             :     MachineOperand &Src0 = MI.getOperand(1);
    3125             :     MachineOperand &Src1 = MI.getOperand(2);
    3126             : 
    3127        2263 :     unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3128        2263 :     unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3129             : 
    3130             :     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3131             :      Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3132        2263 :      &AMDGPU::SReg_32_XM0RegClass);
    3133             :     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3134             :       Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3135        2263 :       &AMDGPU::SReg_32_XM0RegClass);
    3136             : 
    3137             :     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3138             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3139        2263 :       &AMDGPU::SReg_32_XM0RegClass);
    3140             :     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3141             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3142        2263 :       &AMDGPU::SReg_32_XM0RegClass);
    3143             : 
    3144        2263 :     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    3145             : 
    3146        2263 :     unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    3147        2263 :     unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    3148        4526 :     BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
    3149             :       .add(Src0Sub0)
    3150             :       .add(Src1Sub0);
    3151        4526 :     BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
    3152             :       .add(Src0Sub1)
    3153             :       .add(Src1Sub1);
    3154        6789 :     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
    3155        2263 :       .addReg(DestSub0)
    3156             :       .addImm(AMDGPU::sub0)
    3157        2263 :       .addReg(DestSub1)
    3158             :       .addImm(AMDGPU::sub1);
    3159        2263 :     MI.eraseFromParent();
    3160             :     return BB;
    3161             :   }
    3162        8915 :   case AMDGPU::SI_INIT_M0: {
    3163       17830 :     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
    3164       17830 :             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    3165        8915 :         .add(MI.getOperand(0));
    3166        8915 :     MI.eraseFromParent();
    3167        8915 :     return BB;
    3168             :   }
    3169           3 :   case AMDGPU::SI_INIT_EXEC:
    3170             :     // This should be before all vector instructions.
    3171             :     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    3172           3 :             AMDGPU::EXEC)
    3173           3 :         .addImm(MI.getOperand(0).getImm());
    3174           3 :     MI.eraseFromParent();
    3175           3 :     return BB;
    3176             : 
    3177             :   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    3178             :     // Extract the thread count from an SGPR input and set EXEC accordingly.
    3179             :     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    3180             :     //
    3181             :     // S_BFE_U32 count, input, {shift, 7}
    3182             :     // S_BFM_B64 exec, count, 0
    3183             :     // S_CMP_EQ_U32 count, 64
    3184             :     // S_CMOV_B64 exec, -1
    3185             :     MachineInstr *FirstMI = &*BB->begin();
    3186           4 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3187           4 :     unsigned InputReg = MI.getOperand(0).getReg();
    3188           4 :     unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3189             :     bool Found = false;
    3190             : 
    3191             :     // Move the COPY of the input reg to the beginning, so that we can use it.
    3192          14 :     for (auto I = BB->begin(); I != &MI; I++) {
    3193          36 :       if (I->getOpcode() != TargetOpcode::COPY ||
    3194          10 :           I->getOperand(0).getReg() != InputReg)
    3195             :         continue;
    3196             : 
    3197           4 :       if (I == FirstMI) {
    3198           0 :         FirstMI = &*++BB->begin();
    3199             :       } else {
    3200           4 :         I->removeFromParent();
    3201             :         BB->insert(FirstMI, &*I);
    3202             :       }
    3203             :       Found = true;
    3204             :       break;
    3205             :     }
    3206             :     assert(Found);
    3207             :     (void)Found;
    3208             : 
    3209             :     // This should be before all vector instructions.
    3210          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
    3211           4 :         .addReg(InputReg)
    3212           4 :         .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    3213          16 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
    3214           4 :             AMDGPU::EXEC)
    3215           4 :         .addReg(CountReg)
    3216             :         .addImm(0);
    3217          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
    3218           4 :         .addReg(CountReg, RegState::Kill)
    3219             :         .addImm(64);
    3220           8 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
    3221           4 :             AMDGPU::EXEC)
    3222             :         .addImm(-1);
    3223           4 :     MI.eraseFromParent();
    3224           4 :     return BB;
    3225             :   }
    3226             : 
    3227             :   case AMDGPU::GET_GROUPSTATICSIZE: {
    3228             :     DebugLoc DL = MI.getDebugLoc();
    3229         122 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
    3230          61 :         .add(MI.getOperand(0))
    3231          61 :         .addImm(MFI->getLDSSize());
    3232          61 :     MI.eraseFromParent();
    3233             :     return BB;
    3234             :   }
    3235          71 :   case AMDGPU::SI_INDIRECT_SRC_V1:
    3236             :   case AMDGPU::SI_INDIRECT_SRC_V2:
    3237             :   case AMDGPU::SI_INDIRECT_SRC_V4:
    3238             :   case AMDGPU::SI_INDIRECT_SRC_V8:
    3239             :   case AMDGPU::SI_INDIRECT_SRC_V16:
    3240          71 :     return emitIndirectSrc(MI, *BB, *getSubtarget());
    3241          90 :   case AMDGPU::SI_INDIRECT_DST_V1:
    3242             :   case AMDGPU::SI_INDIRECT_DST_V2:
    3243             :   case AMDGPU::SI_INDIRECT_DST_V4:
    3244             :   case AMDGPU::SI_INDIRECT_DST_V8:
    3245             :   case AMDGPU::SI_INDIRECT_DST_V16:
    3246          90 :     return emitIndirectDst(MI, *BB, *getSubtarget());
    3247          82 :   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    3248             :   case AMDGPU::SI_KILL_I1_PSEUDO:
    3249          82 :     return splitKillBlock(MI, BB);
    3250          49 :   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
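                     :     // The 64-bit conditional move is expanded into two 32-bit selects over the
                     :     // sub-registers, roughly:
                     :     //   v_cndmask_b32_e64 dst_lo, src0.sub0, src1.sub0, cond
                     :     //   v_cndmask_b32_e64 dst_hi, src0.sub1, src1.sub1, cond
                     :     // with a REG_SEQUENCE recombining dst_lo/dst_hi into the 64-bit result.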
    3251          49 :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3252             : 
    3253          49 :     unsigned Dst = MI.getOperand(0).getReg();
    3254          49 :     unsigned Src0 = MI.getOperand(1).getReg();
    3255          49 :     unsigned Src1 = MI.getOperand(2).getReg();
    3256             :     const DebugLoc &DL = MI.getDebugLoc();
    3257          49 :     unsigned SrcCond = MI.getOperand(3).getReg();
    3258             : 
    3259          49 :     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3260          49 :     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3261          49 :     unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    3262             : 
    3263         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
    3264          49 :       .addReg(SrcCond);
    3265         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
    3266          49 :       .addReg(Src0, 0, AMDGPU::sub0)
    3267          49 :       .addReg(Src1, 0, AMDGPU::sub0)
    3268          49 :       .addReg(SrcCondCopy);
    3269         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
    3270          49 :       .addReg(Src0, 0, AMDGPU::sub1)
    3271          49 :       .addReg(Src1, 0, AMDGPU::sub1)
    3272          49 :       .addReg(SrcCondCopy);
    3273             : 
    3274         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
    3275          49 :       .addReg(DstLo)
    3276             :       .addImm(AMDGPU::sub0)
    3277          49 :       .addReg(DstHi)
    3278             :       .addImm(AMDGPU::sub1);
    3279          49 :     MI.eraseFromParent();
    3280          49 :     return BB;
    3281             :   }
    3282          78 :   case AMDGPU::SI_BR_UNDEF: {
    3283          78 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3284             :     const DebugLoc &DL = MI.getDebugLoc();
    3285         234 :     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    3286          78 :                            .add(MI.getOperand(0));
    3287          78 :     Br->getOperand(1).setIsUndef(true); // read undef SCC
    3288          78 :     MI.eraseFromParent();
    3289          78 :     return BB;
    3290             :   }
    3291         906 :   case AMDGPU::ADJCALLSTACKUP:
    3292             :   case AMDGPU::ADJCALLSTACKDOWN: {
    3293         906 :     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    3294             :     MachineInstrBuilder MIB(*MF, &MI);
    3295             : 
    3296             :     // Add an implicit use of the frame offset reg to prevent the restore copy
    3297             :     // inserted after the call from being reordered after stack operations in
    3298             :     // the caller's frame.
    3299         906 :     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
    3300         906 :         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
    3301         906 :         .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
    3302             :     return BB;
    3303             :   }
    3304         487 :   case AMDGPU::SI_CALL_ISEL:
    3305             :   case AMDGPU::SI_TCRETURN_ISEL: {
    3306         487 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3307             :     const DebugLoc &DL = MI.getDebugLoc();
    3308         487 :     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    3309             : 
    3310         487 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3311         487 :     unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    3312         487 :     MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    3313             :     assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
    3314             : 
    3315         487 :     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
    3316             : 
    3317             :     MachineInstrBuilder MIB;
    3318         974 :     if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
    3319         906 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
    3320         453 :         .add(MI.getOperand(0))
    3321             :         .addGlobalAddress(G);
    3322             :     } else {
    3323          68 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
    3324          34 :         .add(MI.getOperand(0))
    3325             :         .addGlobalAddress(G);
    3326             : 
    3327             :       // There is an additional imm operand for tcreturn, but it should be in the
    3328             :       // right place already.
    3329             :     }
    3330             : 
    3331        3177 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
    3332        2690 :       MIB.add(MI.getOperand(I));
    3333             : 
    3334         487 :     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3335         487 :     MI.eraseFromParent();
    3336             :     return BB;
    3337             :   }
    3338           0 :   default:
    3339           0 :     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    3340             :   }
    3341             : }
    3342             : 
    3343       24876 : bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
    3344       49752 :   return isTypeLegal(VT.getScalarType());
    3345             : }
    3346             : 
    3347        4366 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
    3348             :   // This currently forces unfolding various combinations of fsub into fma with
    3349             :   // free fneg'd operands. As long as we have fast FMA (controlled by
    3350             :   // isFMAFasterThanFMulAndFAdd), we should perform these.
    3351             : 
    3352             :   // When fma is quarter rate, for f64 where add / sub are at best half rate,
    3353             :   // most of these combines appear to be cycle neutral but save on instruction
    3354             :   // count / code size.
    3355        4366 :   return true;
    3356             : }
    3357             : 
    3358       13792 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
    3359             :                                          EVT VT) const {
    3360       13792 :   if (!VT.isVector()) {
    3361       13721 :     return MVT::i1;
    3362             :   }
    3363         142 :   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
    3364             : }
    3365             : 
    3366      130275 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
    3367             :   // TODO: Should i16 always be used when legal? For now it would force VALU
    3368             :   // shifts.
    3369      130275 :   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
    3370             : }
    3371             : 
    3372             : // Answering this is somewhat tricky and depends on the specific device, since
    3373             : // different devices have different rates for fma and for f64 operations in general.
    3374             : //
    3375             : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
    3376             : // regardless of which device (although the number of cycles differs between
    3377             : // devices), so it is always profitable for f64.
    3378             : //
    3379             : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
    3380             : // only on full rate devices. Normally, we should prefer selecting v_mad_f32
    3381             : // which we can always do even without fused FP ops since it returns the same
    3382             : // result as the separate operations and since it is always full
    3383             : // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32,
    3384             : // however, does not support denormals, so we do report fma as faster if we have
    3385             : // a fast fma device and require denormals.
    3386             : //
    3387       12154 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
    3388       12154 :   VT = VT.getScalarType();
    3389             : 
    3390       12154 :   switch (VT.getSimpleVT().SimpleTy) {
    3391        9235 :   case MVT::f32: {
    3392             :     // fma is as fast as the separate operations on some subtargets. However, we
    3393             :     // always have full-rate f32 mad available, which returns the same result as
    3394             :     // the separate operations and which we should prefer over fma. We can't use
    3395             :     // mad if we want to support denormals, so only report fma as faster then.
    3396        9235 :     if (Subtarget->hasFP32Denormals())
    3397         588 :       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
    3398             : 
    3399             :     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    3400        8647 :     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
    3401             :   }
    3402             :   case MVT::f64:
    3403             :     return true;
    3404        1863 :   case MVT::f16:
    3405        1863 :     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
    3406             :   default:
    3407             :     break;
    3408             :   }
    3409             : 
    3410           0 :   return false;
    3411             : }
    3412             : 
    3413             : //===----------------------------------------------------------------------===//
    3414             : // Custom DAG Lowering Operations
    3415             : //===----------------------------------------------------------------------===//
    3416             : 
    3417             : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
    3418             : // wider vector type is legal.
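                     : // For example, a v4f16 operation is split into two v2f16 operations on the low
                     : // and high halves of the operand(s), and the two halves are concatenated back
                     : // into a v4f16 result rather than being scalarized into four f16 operations.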
    3419           6 : SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
    3420             :                                              SelectionDAG &DAG) const {
    3421             :   unsigned Opc = Op.getOpcode();
    3422           6 :   EVT VT = Op.getValueType();
    3423             :   assert(VT == MVT::v4f16);
    3424             : 
    3425             :   SDValue Lo, Hi;
    3426          12 :   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
    3427             : 
    3428             :   SDLoc SL(Op);
    3429             :   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
    3430          12 :                              Op->getFlags());
    3431             :   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
    3432          12 :                              Op->getFlags());
    3433             : 
    3434          18 :   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
    3435             : }
    3436             : 
    3437             : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
    3438             : // wider vector type is legal.
    3439          86 : SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
    3440             :                                               SelectionDAG &DAG) const {
    3441             :   unsigned Opc = Op.getOpcode();
    3442          86 :   EVT VT = Op.getValueType();
    3443             :   assert(VT == MVT::v4i16 || VT == MVT::v4f16);
    3444             : 
    3445             :   SDValue Lo0, Hi0;
    3446         172 :   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
    3447             :   SDValue Lo1, Hi1;
    3448         172 :   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
    3449             : 
    3450             :   SDLoc SL(Op);
    3451             : 
    3452             :   SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
    3453         172 :                              Op->getFlags());
    3454             :   SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
    3455         172 :                              Op->getFlags());
    3456             : 
    3457         258 :   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
    3458             : }
    3459             : 
    3460      208057 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    3461      416114 :   switch (Op.getOpcode()) {
    3462       21784 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    3463        1624 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    3464       84548 :   case ISD::LOAD: {
    3465       84548 :     SDValue Result = LowerLOAD(Op, DAG);
    3466             :     assert((!Result.getNode() ||
    3467             :             Result.getNode()->getNumValues() == 2) &&
    3468             :            "Load should return a value and a chain");
    3469       84548 :     return Result;
    3470             :   }
    3471             : 
    3472          51 :   case ISD::FSIN:
    3473             :   case ISD::FCOS:
    3474          51 :     return LowerTrig(Op, DAG);
    3475         654 :   case ISD::SELECT: return LowerSELECT(Op, DAG);
    3476         235 :   case ISD::FDIV: return LowerFDIV(Op, DAG);
    3477         261 :   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
    3478       80302 :   case ISD::STORE: return LowerSTORE(Op, DAG);
    3479         883 :   case ISD::GlobalAddress: {
    3480         883 :     MachineFunction &MF = DAG.getMachineFunction();
    3481         883 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    3482         883 :     return LowerGlobalAddress(MFI, Op, DAG);
    3483             :   }
    3484        6343 :   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    3485        1801 :   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
    3486        2318 :   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
    3487          45 :   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
    3488         143 :   case ISD::INSERT_VECTOR_ELT:
    3489         143 :     return lowerINSERT_VECTOR_ELT(Op, DAG);
    3490        5253 :   case ISD::EXTRACT_VECTOR_ELT:
    3491        5253 :     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    3492        1205 :   case ISD::BUILD_VECTOR:
    3493        1205 :     return lowerBUILD_VECTOR(Op, DAG);
    3494         479 :   case ISD::FP_ROUND:
    3495         479 :     return lowerFP_ROUND(Op, DAG);
    3496          27 :   case ISD::TRAP:
    3497          27 :     return lowerTRAP(Op, DAG);
    3498           9 :   case ISD::DEBUGTRAP:
    3499           9 :     return lowerDEBUGTRAP(Op, DAG);
    3500           6 :   case ISD::FABS:
    3501             :   case ISD::FNEG:
    3502           6 :     return splitUnaryVectorOp(Op, DAG);
    3503          86 :   case ISD::SHL:
    3504             :   case ISD::SRA:
    3505             :   case ISD::SRL:
    3506             :   case ISD::ADD:
    3507             :   case ISD::SUB:
    3508             :   case ISD::MUL:
    3509             :   case ISD::SMIN:
    3510             :   case ISD::SMAX:
    3511             :   case ISD::UMIN:
    3512             :   case ISD::UMAX:
    3513             :   case ISD::FMINNUM:
    3514             :   case ISD::FMAXNUM:
    3515             :   case ISD::FADD:
    3516             :   case ISD::FMUL:
    3517          86 :     return splitBinaryVectorOp(Op, DAG);
    3518             :   }
    3519             :   return SDValue();
    3520             : }
    3521             : 
    3522             : static unsigned getImageOpcode(unsigned IID) {
    3523             :   switch (IID) {
    3524             :   case Intrinsic::amdgcn_image_load:
    3525             :     return AMDGPUISD::IMAGE_LOAD;
    3526             :   case Intrinsic::amdgcn_image_load_mip:
    3527             :     return AMDGPUISD::IMAGE_LOAD_MIP;
    3528             : 
    3529             :   // Basic sample.
    3530             :   case Intrinsic::amdgcn_image_sample:
    3531             :     return AMDGPUISD::IMAGE_SAMPLE;
    3532             :   case Intrinsic::amdgcn_image_sample_cl:
    3533             :     return AMDGPUISD::IMAGE_SAMPLE_CL;
    3534             :   case Intrinsic::amdgcn_image_sample_d:
    3535             :     return AMDGPUISD::IMAGE_SAMPLE_D;
    3536             :   case Intrinsic::amdgcn_image_sample_d_cl:
    3537             :     return AMDGPUISD::IMAGE_SAMPLE_D_CL;
    3538             :   case Intrinsic::amdgcn_image_sample_l:
    3539             :     return AMDGPUISD::IMAGE_SAMPLE_L;
    3540             :   case Intrinsic::amdgcn_image_sample_b:
    3541             :     return AMDGPUISD::IMAGE_SAMPLE_B;
    3542             :   case Intrinsic::amdgcn_image_sample_b_cl:
    3543             :     return AMDGPUISD::IMAGE_SAMPLE_B_CL;
    3544             :   case Intrinsic::amdgcn_image_sample_lz:
    3545             :     return AMDGPUISD::IMAGE_SAMPLE_LZ;
    3546             :   case Intrinsic::amdgcn_image_sample_cd:
    3547             :     return AMDGPUISD::IMAGE_SAMPLE_CD;
    3548             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    3549             :     return AMDGPUISD::IMAGE_SAMPLE_CD_CL;
    3550             : 
    3551             :   // Sample with comparison.
    3552             :   case Intrinsic::amdgcn_image_sample_c:
    3553             :     return AMDGPUISD::IMAGE_SAMPLE_C;
    3554             :   case Intrinsic::amdgcn_image_sample_c_cl:
    3555             :     return AMDGPUISD::IMAGE_SAMPLE_C_CL;
    3556             :   case Intrinsic::amdgcn_image_sample_c_d:
    3557             :     return AMDGPUISD::IMAGE_SAMPLE_C_D;
    3558             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    3559             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_CL;
    3560             :   case Intrinsic::amdgcn_image_sample_c_l:
    3561             :     return AMDGPUISD::IMAGE_SAMPLE_C_L;
    3562             :   case Intrinsic::amdgcn_image_sample_c_b:
    3563             :     return AMDGPUISD::IMAGE_SAMPLE_C_B;
    3564             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    3565             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_CL;
    3566             :   case Intrinsic::amdgcn_image_sample_c_lz:
    3567             :     return AMDGPUISD::IMAGE_SAMPLE_C_LZ;
    3568             :   case Intrinsic::amdgcn_image_sample_c_cd:
    3569             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD;
    3570             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    3571             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL;
    3572             : 
    3573             :   // Sample with offsets.
    3574             :   case Intrinsic::amdgcn_image_sample_o:
    3575             :     return AMDGPUISD::IMAGE_SAMPLE_O;
    3576             :   case Intrinsic::amdgcn_image_sample_cl_o:
    3577             :     return AMDGPUISD::IMAGE_SAMPLE_CL_O;
    3578             :   case Intrinsic::amdgcn_image_sample_d_o:
    3579             :     return AMDGPUISD::IMAGE_SAMPLE_D_O;
    3580             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    3581             :     return AMDGPUISD::IMAGE_SAMPLE_D_CL_O;
    3582             :   case Intrinsic::amdgcn_image_sample_l_o:
    3583             :     return AMDGPUISD::IMAGE_SAMPLE_L_O;
    3584             :   case Intrinsic::amdgcn_image_sample_b_o:
    3585             :     return AMDGPUISD::IMAGE_SAMPLE_B_O;
    3586             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    3587             :     return AMDGPUISD::IMAGE_SAMPLE_B_CL_O;
    3588             :   case Intrinsic::amdgcn_image_sample_lz_o:
    3589             :     return AMDGPUISD::IMAGE_SAMPLE_LZ_O;
    3590             :   case Intrinsic::amdgcn_image_sample_cd_o:
    3591             :     return AMDGPUISD::IMAGE_SAMPLE_CD_O;
    3592             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    3593             :     return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O;
    3594             : 
    3595             :   // Sample with comparison and offsets.
    3596             :   case Intrinsic::amdgcn_image_sample_c_o:
    3597             :     return AMDGPUISD::IMAGE_SAMPLE_C_O;
    3598             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    3599             :     return AMDGPUISD::IMAGE_SAMPLE_C_CL_O;
    3600             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    3601             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_O;
    3602             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    3603             :     return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O;
    3604             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    3605             :     return AMDGPUISD::IMAGE_SAMPLE_C_L_O;
    3606             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    3607             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_O;
    3608             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    3609             :     return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O;
    3610             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    3611             :     return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O;
    3612             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    3613             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_O;
    3614             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    3615             :     return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O;
    3616             : 
    3617             :   // Basic gather4.
    3618             :   case Intrinsic::amdgcn_image_gather4:
    3619             :     return AMDGPUISD::IMAGE_GATHER4;
    3620             :   case Intrinsic::amdgcn_image_gather4_cl:
    3621             :     return AMDGPUISD::IMAGE_GATHER4_CL;
    3622             :   case Intrinsic::amdgcn_image_gather4_l:
    3623             :     return AMDGPUISD::IMAGE_GATHER4_L;
    3624             :   case Intrinsic::amdgcn_image_gather4_b:
    3625             :     return AMDGPUISD::IMAGE_GATHER4_B;
    3626             :   case Intrinsic::amdgcn_image_gather4_b_cl:
    3627             :     return AMDGPUISD::IMAGE_GATHER4_B_CL;
    3628             :   case Intrinsic::amdgcn_image_gather4_lz:
    3629             :     return AMDGPUISD::IMAGE_GATHER4_LZ;
    3630             : 
    3631             :   // Gather4 with comparison.
    3632             :   case Intrinsic::amdgcn_image_gather4_c:
    3633             :     return AMDGPUISD::IMAGE_GATHER4_C;
    3634             :   case Intrinsic::amdgcn_image_gather4_c_cl:
    3635             :     return AMDGPUISD::IMAGE_GATHER4_C_CL;
    3636             :   case Intrinsic::amdgcn_image_gather4_c_l:
    3637             :     return AMDGPUISD::IMAGE_GATHER4_C_L;
    3638             :   case Intrinsic::amdgcn_image_gather4_c_b:
    3639             :     return AMDGPUISD::IMAGE_GATHER4_C_B;
    3640             :   case Intrinsic::amdgcn_image_gather4_c_b_cl:
    3641             :     return AMDGPUISD::IMAGE_GATHER4_C_B_CL;
    3642             :   case Intrinsic::amdgcn_image_gather4_c_lz:
    3643             :     return AMDGPUISD::IMAGE_GATHER4_C_LZ;
    3644             : 
    3645             :   // Gather4 with offsets.
    3646             :   case Intrinsic::amdgcn_image_gather4_o:
    3647             :     return AMDGPUISD::IMAGE_GATHER4_O;
    3648             :   case Intrinsic::amdgcn_image_gather4_cl_o:
    3649             :     return AMDGPUISD::IMAGE_GATHER4_CL_O;
    3650             :   case Intrinsic::amdgcn_image_gather4_l_o:
    3651             :     return AMDGPUISD::IMAGE_GATHER4_L_O;
    3652             :   case Intrinsic::amdgcn_image_gather4_b_o:
    3653             :     return AMDGPUISD::IMAGE_GATHER4_B_O;
    3654             :   case Intrinsic::amdgcn_image_gather4_b_cl_o:
    3655             :     return AMDGPUISD::IMAGE_GATHER4_B_CL_O;
    3656             :   case Intrinsic::amdgcn_image_gather4_lz_o:
    3657             :     return AMDGPUISD::IMAGE_GATHER4_LZ_O;
    3658             : 
    3659             :   // Gather4 with comparison and offsets.
    3660             :   case Intrinsic::amdgcn_image_gather4_c_o:
    3661             :     return AMDGPUISD::IMAGE_GATHER4_C_O;
    3662             :   case Intrinsic::amdgcn_image_gather4_c_cl_o:
    3663             :     return AMDGPUISD::IMAGE_GATHER4_C_CL_O;
    3664             :   case Intrinsic::amdgcn_image_gather4_c_l_o:
    3665             :     return AMDGPUISD::IMAGE_GATHER4_C_L_O;
    3666             :   case Intrinsic::amdgcn_image_gather4_c_b_o:
    3667             :     return AMDGPUISD::IMAGE_GATHER4_C_B_O;
    3668             :   case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
    3669             :     return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O;
    3670             :   case Intrinsic::amdgcn_image_gather4_c_lz_o:
    3671             :     return AMDGPUISD::IMAGE_GATHER4_C_LZ_O;
    3672             : 
    3673             :   default:
    3674             :     break;
    3675             :   }
    3676             :   return 0;
    3677             : }
    3678             : 
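                     : // Convert the raw result of a d16 load back to the requested v2f16/v4f16 type.
                     : // With unpacked d16 memory instructions each 16-bit element comes back in a
                     : // separate 32-bit register, so every element is truncated to i16 and the
                     : // rebuilt integer vector is bitcast back; otherwise a single bitcast of the
                     : // packed result suffices.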
    3679          27 : static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
    3680             :                                        const SDLoc &DL,
    3681             :                                        SelectionDAG &DAG, bool Unpacked) {
    3682          27 :   if (!LoadVT.isVector())
    3683           2 :     return Result;
    3684             : 
    3685          25 :   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    3686             :     // Truncate to v2i16/v4i16.
    3687          25 :     EVT IntLoadVT = LoadVT.changeTypeToInteger();
    3688             : 
    3689             :     // Work around the legalizer not scalarizing truncate after vector op
    3690             :     // legalization by not creating an intermediate vector trunc.
    3691             :     SmallVector<SDValue, 4> Elts;
    3692          25 :     DAG.ExtractVectorElements(Result, Elts);
    3693         197 :     for (SDValue &Elt : Elts)
    3694          86 :       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
    3695             : 
    3696          25 :     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
    3697             : 
    3698             :     // Bitcast to original type (v2f16/v4f16).
    3699          25 :     return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
    3700             :   }
    3701             : 
    3702             :   // Cast back to the original packed type.
    3703           0 :   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
    3704             : }
    3705             : 
    3706          39 : SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
    3707             :                                               MemSDNode *M,
    3708             :                                               SelectionDAG &DAG,
    3709             :                                               bool IsIntrinsic) const {
    3710             :   SDLoc DL(M);
    3711             :   SmallVector<SDValue, 10> Ops;
    3712          39 :   Ops.reserve(M->getNumOperands());
    3713             : 
    3714          78 :   Ops.push_back(M->getOperand(0));
    3715          39 :   if (IsIntrinsic)
    3716           7 :     Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
    3717             : 
    3718             :   // Skip operand 1, as it is the intrinsic ID.
    3719         678 :   for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
    3720         600 :     Ops.push_back(M->getOperand(I));
    3721             : 
    3722          39 :   bool Unpacked = Subtarget->hasUnpackedD16VMem();
    3723          78 :   EVT LoadVT = M->getValueType(0);
    3724             : 
    3725          39 :   EVT EquivLoadVT = LoadVT;
    3726          66 :   if (Unpacked && LoadVT.isVector()) {
    3727          25 :     EquivLoadVT = LoadVT.isVector() ?
    3728          25 :       EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    3729          50 :                        LoadVT.getVectorNumElements()) : LoadVT;
    3730             :   }
    3731             : 
    3732             :   // Change from v4f16/v2f16 to EquivLoadVT.
    3733          39 :   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
    3734             : 
    3735             :   SDValue Load
    3736             :     = DAG.getMemIntrinsicNode(
    3737             :       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
    3738             :       VTList, Ops, M->getMemoryVT(),
    3739          78 :       M->getMemOperand());
    3740          39 :   if (!Unpacked) // Just adjusted the opcode.
    3741          12 :     return Load;
    3742             : 
    3743          27 :   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
    3744             : 
    3745          54 :   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
    3746             : }
    3747             : 
    3748         344 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
    3749             :                                           SmallVectorImpl<SDValue> &Results,
    3750             :                                           SelectionDAG &DAG) const {
    3751         688 :   switch (N->getOpcode()) {
    3752             :   case ISD::INSERT_VECTOR_ELT: {
    3753          67 :     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
    3754          23 :       Results.push_back(Res);
    3755          67 :     return;
    3756             :   }
    3757             :   case ISD::EXTRACT_VECTOR_ELT: {
    3758           0 :     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
    3759           0 :       Results.push_back(Res);
    3760           0 :     return;
    3761             :   }
    3762          84 :   case ISD::INTRINSIC_WO_CHAIN: {
    3763         252 :     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    3764             :     switch (IID) {
    3765          28 :     case Intrinsic::amdgcn_cvt_pkrtz: {
    3766          28 :       SDValue Src0 = N->getOperand(1);
    3767          28 :       SDValue Src1 = N->getOperand(2);
    3768             :       SDLoc SL(N);
    3769             :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
    3770          28 :                                 Src0, Src1);
    3771          56 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    3772             :       return;
    3773             :     }
    3774          56 :     case Intrinsic::amdgcn_cvt_pknorm_i16:
    3775             :     case Intrinsic::amdgcn_cvt_pknorm_u16:
    3776             :     case Intrinsic::amdgcn_cvt_pk_i16:
    3777             :     case Intrinsic::amdgcn_cvt_pk_u16: {
    3778          56 :       SDValue Src0 = N->getOperand(1);
    3779          56 :       SDValue Src1 = N->getOperand(2);
    3780             :       SDLoc SL(N);
    3781             :       unsigned Opcode;
    3782             : 
    3783          56 :       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
    3784             :         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    3785          38 :       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
    3786             :         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    3787          20 :       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
    3788             :         Opcode = AMDGPUISD::CVT_PK_I16_I32;
    3789             :       else
    3790             :         Opcode = AMDGPUISD::CVT_PK_U16_U32;
    3791             : 
    3792          56 :       SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
    3793         112 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    3794             :       return;
    3795             :     }
    3796             :     }
    3797             :     break;
    3798             :   }
    3799             :   case ISD::INTRINSIC_W_CHAIN: {
    3800           0 :     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
    3801           0 :       Results.push_back(Res);
    3802           0 :       Results.push_back(Res.getValue(1));
    3803           0 :       return;
    3804             :     }
    3805             : 
    3806           0 :     break;
    3807             :   }
    3808             :   case ISD::SELECT: {
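                     :     // Legalize the illegally-typed select by bitcasting the operands to an
                     :     // equivalent integer type (widened to i32 if narrower), selecting on
                     :     // that, and bitcasting the result back.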
    3809             :     SDLoc SL(N);
    3810          52 :     EVT VT = N->getValueType(0);
    3811          26 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    3812          52 :     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    3813          52 :     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
    3814             : 
    3815          26 :     EVT SelectVT = NewVT;
    3816          26 :     if (NewVT.bitsLT(MVT::i32)) {
    3817           2 :       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
    3818           2 :       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
    3819             :       SelectVT = MVT::i32;
    3820             :     }
    3821             : 
    3822             :     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
    3823          52 :                                     N->getOperand(0), LHS, RHS);
    3824             : 
    3825           0 :     if (NewVT != SelectVT)
    3826           2 :       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    3827          52 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    3828             :     return;
    3829             :   }
    3830             :   case ISD::FNEG: {
    3831          11 :     if (N->getValueType(0) != MVT::v2f16)
    3832             :       break;
    3833             : 
    3834             :     SDLoc SL(N);
    3835          20 :     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    3836             : 
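                     :     // fneg of both f16 halves at once: flip the two sign bits with a
                     :     // single 32-bit XOR.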
    3837             :     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
    3838             :                              BC,
    3839          20 :                              DAG.getConstant(0x80008000, SL, MVT::i32));
    3840          20 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    3841             :     return;
    3842             :   }
    3843             :   case ISD::FABS: {
    3844          13 :     if (N->getValueType(0) != MVT::v2f16)
    3845             :       break;
    3846             : 
    3847             :     SDLoc SL(N);
    3848          22 :     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    3849             : 
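                     :     // fabs of both f16 halves at once: clear the two sign bits with a
                     :     // single 32-bit AND.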
    3850             :     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
    3851             :                              BC,
    3852          22 :                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    3853          22 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    3854             :     return;
    3855             :   }
    3856             :   default:
    3857             :     break;
    3858             :   }
    3859             : }
    3860             : 
    3861             : /// Helper function for LowerBRCOND
    3862             : static SDNode *findUser(SDValue Value, unsigned Opcode) {
    3863             : 
    3864             :   SDNode *Parent = Value.getNode();
    3865         734 :   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
    3866        1539 :        I != E; ++I) {
    3867             : 
    3868         805 :     if (I.getUse().get() != Value)
    3869             :       continue;
    3870             : 
    3871         731 :     if (I->getOpcode() == Opcode)
    3872             :       return *I;
    3873             :   }
    3874             :   return nullptr;
    3875             : }
    3876             : 
    3877        1624 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    3878        1624 :   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    3879        1392 :     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    3880             :     case Intrinsic::amdgcn_if:
    3881             :       return AMDGPUISD::IF;
    3882          51 :     case Intrinsic::amdgcn_else:
    3883          51 :       return AMDGPUISD::ELSE;
    3884          61 :     case Intrinsic::amdgcn_loop:
    3885          61 :       return AMDGPUISD::LOOP;
    3886           0 :     case Intrinsic::amdgcn_end_cf:
    3887           0 :       llvm_unreachable("should not occur");
    3888           2 :     default:
    3889           2 :       return 0;
    3890             :     }
    3891             :   }
    3892             : 
    3893             :   // break, if_break, else_break are all only used as inputs to loop, not
    3894             :   // directly as branch conditions.
    3895             :   return 0;
    3896             : }
    3897             : 
    3898           4 : void SITargetLowering::createDebuggerPrologueStackObjects(
    3899             :     MachineFunction &MF) const {
    3900             :   // Create the stack objects that are used for emitting the debugger prologue.
    3901             :   //
    3902             :   // The debugger prologue writes the work group IDs and work item IDs to scratch
    3903             :   //   memory at a fixed location, in the following format:
    3904             :   //   offset 0:  work group ID x
    3905             :   //   offset 4:  work group ID y
    3906             :   //   offset 8:  work group ID z
    3907             :   //   offset 16: work item ID x
    3908             :   //   offset 20: work item ID y
    3909             :   //   offset 24: work item ID z
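                     :   //
                     :   // e.g. for dimension i = 1, the loop below creates fixed objects at
                     :   // offsets 4 (work group ID y) and 20 (work item ID y).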
    3910           4 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3911             :   int ObjectIdx = 0;
    3912             : 
    3913             :   // For each dimension:
    3914          28 :   for (unsigned i = 0; i < 3; ++i) {
    3915             :     // Create fixed stack object for work group ID.
    3916          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
    3917             :     Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    3918             :     // Create fixed stack object for work item ID.
    3919          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
    3920             :     Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
    3921             :   }
    3922           4 : }
    3923             : 
    3924        1150 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
    3925        1150 :   const Triple &TT = getTargetMachine().getTargetTriple();
    3926        1044 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    3927        1256 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    3928        1256 :          AMDGPU::shouldEmitConstantsToTextSection(TT);
    3929             : }
    3930             : 
    3931         598 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
    3932         544 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    3933         469 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    3934         129 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    3935         792 :          !shouldEmitFixup(GV) &&
    3936         663 :          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
    3937             : }
    3938             : 
    3939         498 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
    3940         498 :   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
    3941             : }
    3942             : 
    3943             : /// This transforms the control flow intrinsics to get the branch destination as
    3944             : /// the last parameter; it also switches the branch target with BR if the need arises.
    3945        1624 : SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
    3946             :                                       SelectionDAG &DAG) const {
    3947             :   SDLoc DL(BRCOND);
    3948             : 
    3949        1624 :   SDNode *Intr = BRCOND.getOperand(1).getNode();
    3950        1624 :   SDValue Target = BRCOND.getOperand(2);
    3951             :   SDNode *BR = nullptr;
    3952             :   SDNode *SetCC = nullptr;
    3953             : 
    3954        1624 :   if (Intr->getOpcode() == ISD::SETCC) {
    3955             :     // As long as we negate the condition everything is fine
    3956             :     SetCC = Intr;
    3957        1291 :     Intr = SetCC->getOperand(0).getNode();
    3958             : 
    3959             :   } else {
    3960             :     // Get the target from BR if we don't negate the condition
    3961             :     BR = findUser(BRCOND, ISD::BR);
    3962         333 :     Target = BR->getOperand(1);
    3963             :   }
    3964             : 
    3965             :   // FIXME: This changes the types of the intrinsics instead of introducing new
    3966             :   // nodes with the correct types.
    3967             :   // e.g. llvm.amdgcn.loop
    3968             : 
    3969             :   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
    3970             :   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
    3971             : 
    3972        1624 :   unsigned CFNode = isCFIntrinsic(Intr);
    3973        1624 :   if (CFNode == 0) {
    3974             :     // This is a uniform branch so we don't need to legalize.
    3975        1162 :     return BRCOND;
    3976             :   }
    3977             : 
    3978         924 :   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
    3979             :                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
    3980             : 
    3981             :   assert(!SetCC ||
    3982             :         (SetCC->getConstantOperandVal(1) == 1 &&
    3983             :          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
    3984             :                                                              ISD::SETNE));
    3985             : 
    3986             :   // operands of the new intrinsic call
    3987             :   SmallVector<SDValue, 4> Ops;
    3988         462 :   if (HaveChain)
    3989         462 :     Ops.push_back(BRCOND.getOperand(0));
    3990             : 
    3991         924 :   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
    3992         462 :   Ops.push_back(Target);
    3993             : 
    3994         924 :   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
    3995             : 
    3996             :   // build the new intrinsic call
    3997         462 :   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
    3998             : 
    3999         462 :   if (!HaveChain) {
    4000             :     SDValue Ops[] =  {
    4001             :       SDValue(Result, 0),
    4002             :       BRCOND.getOperand(0)
    4003           0 :     };
    4004             : 
    4005           0 :     Result = DAG.getMergeValues(Ops, DL).getNode();
    4006             :   }
    4007             : 
    4008         462 :   if (BR) {
    4009             :     // Give the branch instruction our target
    4010             :     SDValue Ops[] = {
    4011          93 :       BR->getOperand(0),
    4012             :       BRCOND.getOperand(2)
    4013         186 :     };
    4014         186 :     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    4015          93 :     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    4016             :     BR = NewBR.getNode();
    4017             :   }
    4018             : 
    4019         924 :   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
    4020             : 
    4021             :   // Copy the intrinsic results to registers
    4022        1325 :   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    4023             :     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    4024         401 :     if (!CopyToReg)
    4025           3 :       continue;
    4026             : 
    4027         398 :     Chain = DAG.getCopyToReg(
    4028             :       Chain, DL,
    4029         398 :       CopyToReg->getOperand(1),
    4030             :       SDValue(Result, i - 1),
    4031         796 :       SDValue());
    4032             : 
    4033         796 :     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
    4034             :   }
    4035             : 
    4036             :   // Remove the old intrinsic from the chain
    4037         924 :   DAG.ReplaceAllUsesOfValueWith(
    4038         462 :     SDValue(Intr, Intr->getNumValues() - 1),
    4039         462 :     Intr->getOperand(0));
    4040             : 
    4041         462 :   return Chain;
    4042             : }
    4043             : 
    4044        2417 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
    4045             :                                             SDValue Op,
    4046             :                                             const SDLoc &DL,
    4047             :                                             EVT VT) const {
    4048        7251 :   return Op.getValueType().bitsLE(VT) ?
    4049        2417 :       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
    4050        7251 :       DAG.getNode(ISD::FP_ROUND, DL, VT, Op, DAG.getIntPtrConstant(0, DL));
    4051             : }
    4052             : 
    4053         479 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
    4054             :   assert(Op.getValueType() == MVT::f16 &&
    4055             :          "Do not know how to custom lower FP_ROUND for non-f16 type");
    4056             : 
    4057         479 :   SDValue Src = Op.getOperand(0);
    4058             :   EVT SrcVT = Src.getValueType();
    4059             :   if (SrcVT != MVT::f64)
    4060         469 :     return Op;
    4061             : 
    4062             :   SDLoc DL(Op);
    4063             : 
    4064          10 :   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
    4065          10 :   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    4066          10 :   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    4067             : }
    4068             : 
    4069          27 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
    4070             :   SDLoc SL(Op);
    4071          27 :   SDValue Chain = Op.getOperand(0);
    4072             : 
    4073          39 :   if (Subtarget->getTrapHandlerAbi() != SISubtarget::TrapHandlerAbiHsa ||
    4074          12 :       !Subtarget->isTrapHandlerEnabled())
    4075          21 :     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
    4076             : 
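                     :   // With the HSA trap handler enabled, pass the queue pointer to the handler
                     :   // in SGPR0/1 and emit a TRAP node carrying the LLVM trap ID.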
    4077           6 :   MachineFunction &MF = DAG.getMachineFunction();
    4078           6 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    4079             :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    4080             :   assert(UserSGPR != AMDGPU::NoRegister);
    4081             :   SDValue QueuePtr = CreateLiveInRegister(
    4082          12 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    4083           6 :   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
    4084             :   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
    4085           6 :                                    QueuePtr, SDValue());
    4086             :   SDValue Ops[] = {
    4087             :     ToReg,
    4088             :     DAG.getTargetConstant(SISubtarget::TrapIDLLVMTrap, SL, MVT::i16),
    4089             :     SGPR01,
    4090             :     ToReg.getValue(1)
    4091          12 :   };
    4092           6 :   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    4093             : }
    4094             : 
    4095           9 : SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
    4096             :   SDLoc SL(Op);
    4097           9 :   SDValue Chain = Op.getOperand(0);
    4098           9 :   MachineFunction &MF = DAG.getMachineFunction();
    4099             : 
    4100          13 :   if (Subtarget->getTrapHandlerAbi() != SISubtarget::TrapHandlerAbiHsa ||
    4101           4 :       !Subtarget->isTrapHandlerEnabled()) {
    4102             :     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
    4103             :                                      "debugtrap handler not supported",
    4104             :                                      Op.getDebugLoc(),
    4105          14 :                                      DS_Warning);
    4106           7 :     LLVMContext &Ctx = MF.getFunction().getContext();
    4107           7 :     Ctx.diagnose(NoTrap);
    4108           7 :     return Chain;
    4109             :   }
    4110             : 
    4111             :   SDValue Ops[] = {
    4112             :     Chain,
    4113             :     DAG.getTargetConstant(SISubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
    4114           4 :   };
    4115           2 :   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    4116             : }
    4117             : 
    4118          32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
    4119             :                                              SelectionDAG &DAG) const {
    4120             :   // FIXME: Use inline constants (src_{shared, private}_base) instead.
    4121          32 :   if (Subtarget->hasApertureRegs()) {
    4122          12 :     unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4123             :         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
    4124             :         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    4125             :     unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4126             :         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
    4127             :         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
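                     :     // Pack the hardware register id, bit offset, and width-1 fields into the
                     :     // single s_getreg_b32 immediate.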
    4128          12 :     unsigned Encoding =
    4129             :         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
    4130          12 :         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
    4131             :         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
    4132             : 
    4133          24 :     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    4134             :     SDValue ApertureReg = SDValue(
    4135          24 :         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    4136          12 :     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    4137          12 :     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
    4138             :   }
    4139             : 
    4140          20 :   MachineFunction &MF = DAG.getMachineFunction();
    4141          20 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    4142             :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    4143             :   assert(UserSGPR != AMDGPU::NoRegister);
    4144             : 
    4145             :   SDValue QueuePtr = CreateLiveInRegister(
    4146          40 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    4147             : 
    4148             :   // Offset into amd_queue_t for group_segment_aperture_base_hi /
    4149             :   // private_segment_aperture_base_hi.
    4150          20 :   uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
    4151             : 
    4152          20 :   SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
    4153             : 
    4154             :   // TODO: Use custom target PseudoSourceValue.
    4155             :   // TODO: We should use the value from the IR intrinsic call, but it might not
    4156             :   // be available, and it is not clear how to get it here.
    4157          20 :   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
    4158          20 :                                               AMDGPUASI.CONSTANT_ADDRESS));
    4159             : 
    4160             :   MachinePointerInfo PtrInfo(V, StructOffset);
    4161             :   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
    4162             :                      MinAlign(64, StructOffset),
    4163             :                      MachineMemOperand::MODereferenceable |
    4164          40 :                          MachineMemOperand::MOInvariant);
    4165             : }
    4166             : 
    4167          45 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
    4168             :                                              SelectionDAG &DAG) const {
    4169             :   SDLoc SL(Op);
    4170             :   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
    4171             : 
    4172          45 :   SDValue Src = ASC->getOperand(0);
    4173          45 :   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
    4174             : 
    4175             :   const AMDGPUTargetMachine &TM =
    4176          45 :     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
    4177             : 
    4178             :   // flat -> local/private
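                     :   // A null flat pointer must map to the segment's null value, hence the
                     :   // compare against null and select below.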
    4179          45 :   if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4180          12 :     unsigned DestAS = ASC->getDestAddressSpace();
    4181             : 
    4182          17 :     if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
    4183           5 :         DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4184             :       unsigned NullVal = TM.getNullPointerValue(DestAS);
    4185          12 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4186          12 :       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
    4187          12 :       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
    4188             : 
    4189             :       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
    4190          12 :                          NonNull, Ptr, SegmentNullPtr);
    4191             :     }
    4192             :   }
    4193             : 
    4194             :   // local/private -> flat
    4195          33 :   if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4196             :     unsigned SrcAS = ASC->getSrcAddressSpace();
    4197             : 
    4198          54 :     if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
    4199          22 :         SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4200             :       unsigned NullVal = TM.getNullPointerValue(SrcAS);
    4201          32 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4202             : 
    4203             :       SDValue NonNull
    4204          32 :         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
    4205             : 
    4206          32 :       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
    4207             :       SDValue CvtPtr
    4208          32 :         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
    4209             : 
    4210             :       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
    4211             :                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
    4212          64 :                          FlatNullPtr);
    4213             :     }
    4214             :   }
    4215             : 
    4216             :   // global <-> flat are no-ops and never emitted.
    4217             : 
    4218           1 :   const MachineFunction &MF = DAG.getMachineFunction();
    4219             :   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    4220           2 :     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
    4221           1 :   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
    4222             : 
    4223           2 :   return DAG.getUNDEF(ASC->getValueType(0));
    4224             : }
    4225             : 
    4226         210 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    4227             :                                                  SelectionDAG &DAG) const {
    4228         210 :   SDValue Vec = Op.getOperand(0);
    4229         210 :   SDValue InsVal = Op.getOperand(1);
    4230         210 :   SDValue Idx = Op.getOperand(2);
    4231         210 :   EVT VecVT = Vec.getValueType();
    4232         210 :   EVT EltVT = VecVT.getVectorElementType();
    4233         210 :   unsigned VecSize = VecVT.getSizeInBits();
    4234         210 :   unsigned EltSize = EltVT.getSizeInBits();
    4235             : 
    4236             : 
    4237             :   assert(VecSize <= 64);
    4238             : 
    4239         210 :   unsigned NumElts = VecVT.getVectorNumElements();
    4240             :   SDLoc SL(Op);
    4241             :   auto KIdx = dyn_cast<ConstantSDNode>(Idx);
    4242             : 
    4243         210 :   if (NumElts == 4 && EltSize == 16 && KIdx) {
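                     :     // For a constant index into a 64-bit vector of 16-bit elements, only the
                     :     // 32-bit half holding the element needs to be rewritten.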
    4244          20 :     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
    4245             : 
    4246             :     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
    4247          40 :                                  DAG.getConstant(0, SL, MVT::i32));
    4248             :     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
    4249          40 :                                  DAG.getConstant(1, SL, MVT::i32));
    4250             : 
    4251          20 :     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    4252          20 :     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
    4253             : 
    4254          40 :     unsigned Idx = KIdx->getZExtValue();
    4255             :     bool InsertLo = Idx < 2;
    4256             :     SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
    4257          20 :       InsertLo ? LoVec : HiVec,
    4258             :       DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
    4259          60 :       DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
    4260             : 
    4261          20 :     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
    4262             : 
    4263             :     SDValue Concat = InsertLo ?
    4264          27 :       DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
    4265          40 :       DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
    4266             : 
    4267          20 :     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
    4268             :   }
    4269             : 
    4270             :   if (isa<ConstantSDNode>(Idx))
    4271         160 :     return SDValue();
    4272             : 
    4273          30 :   MVT IntVT = MVT::getIntegerVT(VecSize);
    4274             : 
    4275             :   // Avoid stack access for dynamic indexing.
    4276          30 :   SDValue Val = InsVal;
    4277             :   if (InsVal.getValueType() == MVT::f16)
    4278           6 :       Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
    4279             : 
    4280             :   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
    4281          30 :   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
    4282             : 
    4283             :   assert(isPowerOf2_32(EltSize));
    4284          30 :   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
    4285             : 
    4286             :   // Convert vector index to bit-index.
    4287          30 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
    4288             : 
    4289          30 :   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
    4290             :   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
    4291             :                             DAG.getConstant(0xffff, SL, IntVT),
    4292          60 :                             ScaledIdx);
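                     :   // e.g. for v2i16 with Idx = 1: ScaledIdx = 16 and BFM = 0xffff0000, so the
                     :   // mask covers the upper half of the 32-bit register.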
    4293             : 
    4294          30 :   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
    4295             :   SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
    4296          60 :                             DAG.getNOT(SL, BFM, IntVT), BCVec);
    4297             : 
    4298          30 :   SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
    4299          30 :   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
    4300             : }
    4301             : 
    4302        5253 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
    4303             :                                                   SelectionDAG &DAG) const {
    4304             :   SDLoc SL(Op);
    4305             : 
    4306        5253 :   EVT ResultVT = Op.getValueType();
    4307        5253 :   SDValue Vec = Op.getOperand(0);
    4308        5253 :   SDValue Idx = Op.getOperand(1);
    4309        5253 :   EVT VecVT = Vec.getValueType();
    4310        5253 :   unsigned VecSize = VecVT.getSizeInBits();
    4311        5253 :   EVT EltVT = VecVT.getVectorElementType();
    4312             :   assert(VecSize <= 64);
    4313             : 
    4314             :   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    4315             : 
    4316             :   // Make sure we do any optimizations that will make it easier to fold
    4317             :   // source modifiers before obscuring it with bit operations.
    4318             : 
    4319             :   // XXX - Why doesn't this get called when vector_shuffle is expanded?
    4320        5253 :   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    4321           7 :     return Combined;
    4322             : 
    4323        5246 :   unsigned EltSize = EltVT.getSizeInBits();
    4324             :   assert(isPowerOf2_32(EltSize));
    4325             : 
    4326        5246 :   MVT IntVT = MVT::getIntegerVT(VecSize);
    4327        5246 :   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
    4328             : 
    4329             :   // Convert vector index to bit-index (* EltSize)
    4330        5246 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
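                     :   // e.g. extracting element 1 of a v2f16: the packed 32-bit value is shifted
                     :   // right by 16 and the low 16 bits are truncated out below.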
    4331             : 
    4332        5246 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
    4333        5246 :   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
    4334             : 
    4335             :   if (ResultVT == MVT::f16) {
    4336        1268 :     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    4337        1268 :     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    4338             :   }
    4339             : 
    4340        3978 :   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
    4341             : }
    4342             : 
    4343        1205 : SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
    4344             :                                             SelectionDAG &DAG) const {
    4345             :   SDLoc SL(Op);
    4346        1205 :   EVT VT = Op.getValueType();
    4347             : 
    4348             :   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    4349         652 :     EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
    4350             : 
    4351             :     // Turn into pair of packed build_vectors.
    4352             :     // TODO: Special case for constants that can be materialized with s_mov_b64.
    4353             :     SDValue Lo = DAG.getBuildVector(HalfVT, SL,
    4354         652 :                                     { Op.getOperand(0), Op.getOperand(1) });
    4355             :     SDValue Hi = DAG.getBuildVector(HalfVT, SL,
    4356         652 :                                     { Op.getOperand(2), Op.getOperand(3) });
    4357             : 
    4358         326 :     SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    4359         326 :     SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
    4360             : 
    4361         652 :     SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    4362         326 :     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
    4363             :   }
    4364             : 
    4365             :   assert(VT == MVT::v2f16 || VT == MVT::v2i16);
    4366             : 
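                     :   // Pack the two 16-bit elements into a single 32-bit value:
                     :   // zext(Lo) | (zext(Hi) << 16), then bitcast to the result type.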
    4367         879 :   SDValue Lo = Op.getOperand(0);
    4368         879 :   SDValue Hi = Op.getOperand(1);
    4369             : 
    4370         879 :   Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    4371         879 :   Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    4372             : 
    4373         879 :   Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
    4374         879 :   Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
    4375             : 
    4376             :   SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
    4377        1758 :                               DAG.getConstant(16, SL, MVT::i32));
    4378             : 
    4379         879 :   SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
    4380             : 
    4381         879 :   return DAG.getNode(ISD::BITCAST, SL, VT, Or);
    4382             : }
    4383             : 
    4384             : bool
    4385        1692 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    4386             :   // We can fold offsets for anything that doesn't require a GOT relocation.
    4387        3356 :   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    4388        3256 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    4389        3384 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    4390        1792 :          !shouldEmitGOTReloc(GA->getGlobal());
    4391             : }
    4392             : 
    4393             : static SDValue
    4394         523 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
    4395             :                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
    4396             :                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
    4397             :   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
    4398             :   // lowered to the following code sequence:
    4399             :   //
    4400             :   // For constant address space:
    4401             :   //   s_getpc_b64 s[0:1]
    4402             :   //   s_add_u32 s0, s0, $symbol
    4403             :   //   s_addc_u32 s1, s1, 0
    4404             :   //
    4405             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4406             :   //   a fixup or relocation is emitted to replace $symbol with a literal
    4407             :   //   constant, which is a pc-relative offset from the encoding of the $symbol
    4408             :   //   operand to the global variable.
    4409             :   //
    4410             :   // For global address space:
    4411             :   //   s_getpc_b64 s[0:1]
    4412             :   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
    4413             :   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
    4414             :   //
    4415             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4416             :   //   fixups or relocations are emitted to replace $symbol@*@lo and
    4417             :   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
    4418             :   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
    4419             :   //   operand to the global variable.
    4420             :   //
    4421             :   // What we want here is an offset from the value returned by s_getpc
    4422             :   // (which is the address of the s_add_u32 instruction) to the global
    4423             :   // variable, but since the encoding of $symbol starts 4 bytes after the start
    4424             :   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
    4425             :   // small. This requires us to add 4 to the global variable offset in order to
    4426             :   // compute the correct address.
    4427         523 :   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4428        1569 :                                              GAFlags);
    4429             :   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4430             :                                              GAFlags == SIInstrInfo::MO_NONE ?
    4431        1046 :                                              GAFlags : GAFlags + 1);
    4432         523 :   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
    4433             : }
    4434             : 
    4435         883 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    4436             :                                              SDValue Op,
    4437             :                                              SelectionDAG &DAG) const {
    4438             :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
    4439         883 :   const GlobalValue *GV = GSD->getGlobal();
    4440             : 
    4441        1738 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
    4442        1710 :       GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
    4443        2567 :       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
    4444             :       // FIXME: It isn't correct to rely on the type of the pointer. This should
    4445             :       // be removed when address space 0 is 64-bit.
    4446         829 :       !GV->getType()->getElementType()->isFunctionTy())
    4447         360 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
    4448             : 
    4449             :   SDLoc DL(GSD);
    4450        1046 :   EVT PtrVT = Op.getValueType();
    4451             : 
    4452         523 :   if (shouldEmitFixup(GV))
    4453          25 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
    4454         498 :   else if (shouldEmitPCReloc(GV))
    4455         473 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
    4456         473 :                                    SIInstrInfo::MO_REL32);
    4457             : 
    4458             :   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
    4459          25 :                                             SIInstrInfo::MO_GOTPCREL32);
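                     :   // The GOT entry holds the absolute address of the global, so load it as an
                     :   // invariant, dereferenceable value.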
    4460             : 
    4461          25 :   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
    4462          25 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    4463          25 :   const DataLayout &DataLayout = DAG.getDataLayout();
    4464          25 :   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
    4465             :   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
    4466          25 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    4467             : 
    4468             :   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
    4469             :                      MachineMemOperand::MODereferenceable |
    4470          25 :                          MachineMemOperand::MOInvariant);
    4471             : }
    4472             : 
    4473        8919 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
    4474             :                                    const SDLoc &DL, SDValue V) const {
    4475             :   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
    4476             :   // the destination register.
    4477             :   //
    4478             :   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
    4479             :   // so we will end up with redundant moves to m0.
    4480             :   //
    4481             :   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
    4482             : 
    4483             :   // A Null SDValue creates a glue result.
    4484        8919 :   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
    4485        8919 :                                   V, Chain);
    4486        8919 :   return SDValue(M0, 0);
    4487             : }
    4488             : 
    4489          91 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
    4490             :                                                  SDValue Op,
    4491             :                                                  MVT VT,
    4492             :                                                  unsigned Offset) const {
    4493             :   SDLoc SL(Op);
    4494             :   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
    4495         182 :                                            DAG.getEntryNode(), Offset, 4, false);
    4496             :   // The local size values will have the hi 16-bits as zero.
    4497             :   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
    4498         273 :                      DAG.getValueType(VT));
    4499             : }
    4500             : 
    4501           2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4502             :                                         EVT VT) {
    4503           2 :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4504             :                                       "non-hsa intrinsic with hsa target",
    4505           6 :                                       DL.getDebugLoc());
    4506           2 :   DAG.getContext()->diagnose(BadIntrin);
    4507           2 :   return DAG.getUNDEF(VT);
    4508             : }
    4509             : 
    4510           5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4511             :                                          EVT VT) {
    4512           5 :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4513             :                                       "intrinsic not supported on subtarget",
    4514          15 :                                       DL.getDebugLoc());
    4515           5 :   DAG.getContext()->diagnose(BadIntrin);
    4516           5 :   return DAG.getUNDEF(VT);
    4517             : }
    4518             : 
    4519        6343 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    4520             :                                                   SelectionDAG &DAG) const {
    4521        6343 :   MachineFunction &MF = DAG.getMachineFunction();
    4522        6343 :   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
    4523             : 
    4524        6343 :   EVT VT = Op.getValueType();
    4525             :   SDLoc DL(Op);
    4526       12686 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4527             : 
    4528             :   // TODO: Should this propagate fast-math-flags?
    4529             : 
    4530        6343 :   switch (IntrinsicID) {
    4531           4 :   case Intrinsic::amdgcn_implicit_buffer_ptr: {
    4532           4 :     if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
    4533           2 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4534             :     return getPreloadedValue(DAG, *MFI, VT,
    4535           2 :                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
    4536             :   }
    4537          56 :   case Intrinsic::amdgcn_dispatch_ptr:
    4538             :   case Intrinsic::amdgcn_queue_ptr: {
    4539          56 :     if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
    4540             :       DiagnosticInfoUnsupported BadIntrin(
    4541             :           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    4542           4 :           DL.getDebugLoc());
    4543           2 :       DAG.getContext()->diagnose(BadIntrin);
    4544           2 :       return DAG.getUNDEF(VT);
    4545             :     }
    4546             : 
    4547          54 :     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
    4548             :       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    4549          54 :     return getPreloadedValue(DAG, *MFI, VT, RegID);
    4550             :   }
    4551          38 :   case Intrinsic::amdgcn_implicitarg_ptr: {
    4552          38 :     if (MFI->isEntryFunction())
    4553          30 :       return getImplicitArgPtr(DAG, DL);
    4554             :     return getPreloadedValue(DAG, *MFI, VT,
    4555           8 :                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    4556             :   }
    4557          22 :   case Intrinsic::amdgcn_kernarg_segment_ptr: {
    4558             :     return getPreloadedValue(DAG, *MFI, VT,
    4559          22 :                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    4560             :   }
    4561           9 :   case Intrinsic::amdgcn_dispatch_id: {
    4562           9 :     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
    4563             :   }
    4564             :   case Intrinsic::amdgcn_rcp:
    4565          20 :     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
    4566             :   case Intrinsic::amdgcn_rsq:
    4567          32 :     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4568           5 :   case Intrinsic::amdgcn_rsq_legacy:
    4569           5 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4570           1 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4571             : 
    4572           4 :     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    4573          11 :   case Intrinsic::amdgcn_rcp_legacy:
    4574          11 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4575           4 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4576           7 :     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
    4577           6 :   case Intrinsic::amdgcn_rsq_clamp: {
    4578           6 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4579           3 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    4580             : 
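                     :     // On VI and newer, emulate rsq_clamp by clamping the plain rsq result to
                     :     // the largest finite value of the type (negated for the lower bound).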
    4581           3 :     Type *Type = VT.getTypeForEVT(*DAG.getContext());
    4582           3 :     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    4583           3 :     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
    4584             : 
    4585           3 :     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4586             :     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
    4587           3 :                               DAG.getConstantFP(Max, DL, VT));
    4588             :     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
    4589           3 :                        DAG.getConstantFP(Min, DL, VT));
    4590             :   }
    4591           2 :   case Intrinsic::r600_read_ngroups_x:
    4592           4 :     if (Subtarget->isAmdHsaOS())
    4593           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4594             : 
    4595             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4596           2 :                                     SI::KernelInputOffsets::NGROUPS_X, 4, false);
    4597           2 :   case Intrinsic::r600_read_ngroups_y:
    4598           4 :     if (Subtarget->isAmdHsaOS())
    4599           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4600             : 
    4601             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4602           2 :                                     SI::KernelInputOffsets::NGROUPS_Y, 4, false);
    4603           2 :   case Intrinsic::r600_read_ngroups_z:
    4604           4 :     if (Subtarget->isAmdHsaOS())
    4605           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4606             : 
    4607             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4608           2 :                                     SI::KernelInputOffsets::NGROUPS_Z, 4, false);
    4609           2 :   case Intrinsic::r600_read_global_size_x:
    4610           4 :     if (Subtarget->isAmdHsaOS())
    4611           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4612             : 
    4613             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4614           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
    4615           2 :   case Intrinsic::r600_read_global_size_y:
    4616           4 :     if (Subtarget->isAmdHsaOS())
    4617           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4618             : 
    4619             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4620           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
    4621           2 :   case Intrinsic::r600_read_global_size_z:
    4622           4 :     if (Subtarget->isAmdHsaOS())
    4623           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4624             : 
    4625             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4626           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
    4627          13 :   case Intrinsic::r600_read_local_size_x:
    4628          26 :     if (Subtarget->isAmdHsaOS())
    4629           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4630             : 
    4631             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4632          13 :                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
    4633          39 :   case Intrinsic::r600_read_local_size_y:
    4634          78 :     if (Subtarget->isAmdHsaOS())
    4635           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4636             : 
    4637             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4638          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
    4639          39 :   case Intrinsic::r600_read_local_size_z:
    4640          78 :     if (Subtarget->isAmdHsaOS())
    4641           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4642             : 
    4643             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4644          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
    4645          49 :   case Intrinsic::amdgcn_workgroup_id_x:
    4646             :   case Intrinsic::r600_read_tgid_x:
    4647             :     return getPreloadedValue(DAG, *MFI, VT,
    4648          49 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    4649          24 :   case Intrinsic::amdgcn_workgroup_id_y:
    4650             :   case Intrinsic::r600_read_tgid_y:
    4651             :     return getPreloadedValue(DAG, *MFI, VT,
    4652          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    4653          24 :   case Intrinsic::amdgcn_workgroup_id_z:
    4654             :   case Intrinsic::r600_read_tgid_z:
    4655             :     return getPreloadedValue(DAG, *MFI, VT,
    4656          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    4657        3015 :   case Intrinsic::amdgcn_workitem_id_x:
    4658             :   case Intrinsic::r600_read_tidig_x:
    4659             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4660        3015 :                           SDLoc(DAG.getEntryNode()),
    4661        6030 :                           MFI->getArgInfo().WorkItemIDX);
    4662             : 
    4663         125 :   case Intrinsic::amdgcn_workitem_id_y:
    4664             :   case Intrinsic::r600_read_tidig_y:
    4665             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4666         125 :                           SDLoc(DAG.getEntryNode()),
    4667         250 :                           MFI->getArgInfo().WorkItemIDY);
    4668          74 :   case Intrinsic::amdgcn_workitem_id_z:
    4669             :   case Intrinsic::r600_read_tidig_z:
    4670             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4671          74 :                           SDLoc(DAG.getEntryNode()),
    4672         148 :                           MFI->getArgInfo().WorkItemIDZ);
    4673         446 :   case AMDGPUIntrinsic::SI_load_const: {
    4674             :     SDValue Ops[] = {
    4675             :       Op.getOperand(1),
    4676             :       Op.getOperand(2)
    4677         446 :     };
    4678             : 
    4679         892 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4680             :         MachinePointerInfo(),
    4681             :         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    4682             :             MachineMemOperand::MOInvariant,
    4683         446 :         VT.getStoreSize(), 4);
    4684             :     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
    4685         892 :                                    Op->getVTList(), Ops, VT, MMO);
    4686             :   }
    4687          33 :   case Intrinsic::amdgcn_fdiv_fast:
    4688          33 :     return lowerFDIV_FAST(Op, DAG);
    4689          84 :   case Intrinsic::amdgcn_interp_mov: {
    4690          84 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4691          84 :     SDValue Glue = M0.getValue(1);
    4692             :     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
    4693          84 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4694             :   }
    4695         216 :   case Intrinsic::amdgcn_interp_p1: {
    4696         216 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4697         216 :     SDValue Glue = M0.getValue(1);
    4698             :     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
    4699         216 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4700             :   }
    4701         200 :   case Intrinsic::amdgcn_interp_p2: {
    4702         200 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    4703         200 :     SDValue Glue = SDValue(M0.getNode(), 1);
    4704             :     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
    4705             :                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
    4706         200 :                        Glue);
    4707             :   }
    4708             :   case Intrinsic::amdgcn_sin:
    4709           5 :     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
    4710             : 
    4711             :   case Intrinsic::amdgcn_cos:
    4712           3 :     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
    4713             : 
    4714           3 :   case Intrinsic::amdgcn_log_clamp: {
    4715           3 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4716           2 :       return SDValue();
    4717             : 
    4718             :     DiagnosticInfoUnsupported BadIntrin(
    4719             :       MF.getFunction(), "intrinsic not supported on subtarget",
    4720           2 :       DL.getDebugLoc());
    4721           1 :     DAG.getContext()->diagnose(BadIntrin);
    4722           1 :     return DAG.getUNDEF(VT);
    4723             :   }
    4724             :   case Intrinsic::amdgcn_ldexp:
    4725             :     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
    4726           9 :                        Op.getOperand(1), Op.getOperand(2));
    4727             : 
    4728             :   case Intrinsic::amdgcn_fract:
    4729           7 :     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    4730             : 
    4731             :   case Intrinsic::amdgcn_class:
    4732             :     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
    4733          61 :                        Op.getOperand(1), Op.getOperand(2));
    4734          10 :   case Intrinsic::amdgcn_div_fmas:
    4735             :     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
    4736             :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    4737          10 :                        Op.getOperand(4));
    4738             : 
    4739             :   case Intrinsic::amdgcn_div_fixup:
    4740             :     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
    4741          13 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4742             : 
    4743             :   case Intrinsic::amdgcn_trig_preop:
    4744             :     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
    4745           4 :                        Op.getOperand(1), Op.getOperand(2));
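                     :   // A rough IR-level picture of this intrinsic (illustrative only): the f32
                     :   // overload is { float, i1 } @llvm.amdgcn.div.scale.f32(float %num, float %den, i1 %sel),
                     :   // where the i1 operand (required to be a constant here) selects whether the
                     :   // numerator or the denominator is the value being scaled.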
    4746          27 :   case Intrinsic::amdgcn_div_scale: {
    4747             :     // 3rd parameter required to be a constant.
    4748             :     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4749             :     if (!Param)
    4750           9 :       return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
    4751             : 
    4752             :     // Translate to the operands expected by the machine instruction. The
    4753             :     // first parameter must be the same as the first instruction.
    4754          24 :     SDValue Numerator = Op.getOperand(1);
    4755          24 :     SDValue Denominator = Op.getOperand(2);
    4756             : 
    4757             :     // Note this order is opposite of the machine instruction's operations,
    4758             :     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    4759             :     // intrinsic has the numerator as the first operand to match a normal
    4760             :     // division operation.
    4761             : 
    4762          48 :     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
    4763             : 
    4764             :     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
    4765          48 :                        Denominator, Numerator);
    4766             :   }
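                     :   // llvm.amdgcn.icmp evaluates the comparison in every active lane and returns
                     :   // the per-lane results packed into a scalar bitmask; the third operand is
                     :   // the IR ICmpInst predicate value (e.g. 32 for eq) passed as a constant.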
    4767          48 :   case Intrinsic::amdgcn_icmp: {
    4768             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4769             :     if (!CD)
    4770           6 :       return DAG.getUNDEF(VT);
    4771             : 
    4772          42 :     int CondCode = CD->getSExtValue();
    4773          42 :     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
    4774             :         CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    4775           2 :       return DAG.getUNDEF(VT);
    4776             : 
    4777             :     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
    4778          40 :     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
    4779             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4780          80 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4781             :   }
    4782          56 :   case Intrinsic::amdgcn_fcmp: {
    4783             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4784             :     if (!CD)
    4785           2 :       return DAG.getUNDEF(VT);
    4786             : 
    4787          54 :     int CondCode = CD->getSExtValue();
    4788          54 :     if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
    4789             :         CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
    4790           2 :       return DAG.getUNDEF(VT);
    4791             : 
    4792             :     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
    4793          52 :     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
    4794             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4795         104 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4796             :   }
    4797             :   case Intrinsic::amdgcn_fmed3:
    4798             :     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
    4799          69 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4800             :   case Intrinsic::amdgcn_fmul_legacy:
    4801             :     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
    4802          31 :                        Op.getOperand(1), Op.getOperand(2));
    4803             :   case Intrinsic::amdgcn_sffbh:
    4804           4 :     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
    4805             :   case Intrinsic::amdgcn_sbfe:
    4806             :     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
    4807         102 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4808             :   case Intrinsic::amdgcn_ubfe:
    4809             :     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
    4810          94 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4811          95 :   case Intrinsic::amdgcn_cvt_pkrtz:
    4812             :   case Intrinsic::amdgcn_cvt_pknorm_i16:
    4813             :   case Intrinsic::amdgcn_cvt_pknorm_u16:
    4814             :   case Intrinsic::amdgcn_cvt_pk_i16:
    4815             :   case Intrinsic::amdgcn_cvt_pk_u16: {
    4816             :     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    4817          95 :     EVT VT = Op.getValueType();
    4818             :     unsigned Opcode;
    4819             : 
    4820          95 :     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
    4821             :       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    4822          56 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
    4823             :       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    4824          38 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
    4825             :       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    4826          20 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
    4827             :       Opcode = AMDGPUISD::CVT_PK_I16_I32;
    4828             :     else
    4829             :       Opcode = AMDGPUISD::CVT_PK_U16_U32;
    4830             : 
    4831             :     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
    4832          95 :                                Op.getOperand(1), Op.getOperand(2));
    4833          95 :     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
    4834             :   }
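                     :   // amdgcn_wqm and amdgcn_wwm just wrap their source in the WQM / WWM target
                     :   // pseudos; the later whole-quad-mode handling decides how the wrapped
                     :   // computation is actually executed.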
    4835          13 :   case Intrinsic::amdgcn_wqm: {
    4836          13 :     SDValue Src = Op.getOperand(1);
    4837          13 :     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
    4838          13 :                    0);
    4839             :   }
    4840          16 :   case Intrinsic::amdgcn_wwm: {
    4841          16 :     SDValue Src = Op.getOperand(1);
    4842          16 :     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
    4843          16 :                    0);
    4844             :   }
    4845          26 :   case Intrinsic::amdgcn_image_getlod:
    4846             :   case Intrinsic::amdgcn_image_getresinfo: {
    4847          26 :     unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
    4848             : 
    4849             :     // Replace dmask with everything disabled with undef.
    4850             :     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
    4851          52 :     if (!DMask || DMask->isNullValue())
    4852           4 :       return DAG.getUNDEF(Op.getValueType());
    4853          22 :     return SDValue();
    4854             :   }
    4855        1051 :   default:
    4856        1051 :     return Op;
    4857             :   }
    4858             : }
    4859             : 
    4860        1801 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
    4861             :                                                  SelectionDAG &DAG) const {
    4862        3602 :   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    4863             :   SDLoc DL(Op);
    4864             : 
    4865        1801 :   switch (IntrID) {
    4866             :   case Intrinsic::amdgcn_atomic_inc:
    4867             :   case Intrinsic::amdgcn_atomic_dec:
    4868             :   case Intrinsic::amdgcn_ds_fadd:
    4869             :   case Intrinsic::amdgcn_ds_fmin:
    4870             :   case Intrinsic::amdgcn_ds_fmax: {
    4871             :     MemSDNode *M = cast<MemSDNode>(Op);
    4872             :     unsigned Opc;
    4873         245 :     switch (IntrID) {
    4874             :     case Intrinsic::amdgcn_atomic_inc:
    4875             :       Opc = AMDGPUISD::ATOMIC_INC;
    4876             :       break;
    4877         115 :     case Intrinsic::amdgcn_atomic_dec:
    4878             :       Opc = AMDGPUISD::ATOMIC_DEC;
    4879         115 :       break;
    4880           6 :     case Intrinsic::amdgcn_ds_fadd:
    4881             :       Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
    4882           6 :       break;
    4883           6 :     case Intrinsic::amdgcn_ds_fmin:
    4884             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
    4885           6 :       break;
    4886           6 :     case Intrinsic::amdgcn_ds_fmax:
    4887             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
    4888           6 :       break;
    4889           0 :     default:
    4890           0 :       llvm_unreachable("Unknown intrinsic!");
    4891             :     }
    4892             :     SDValue Ops[] = {
    4893         245 :       M->getOperand(0), // Chain
    4894             :       M->getOperand(2), // Ptr
    4895             :       M->getOperand(3)  // Value
    4896         245 :     };
    4897             : 
    4898         245 :     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
    4899         980 :                                    M->getMemoryVT(), M->getMemOperand());
    4900             :   }
    4901         190 :   case Intrinsic::amdgcn_buffer_load:
    4902             :   case Intrinsic::amdgcn_buffer_load_format: {
    4903             :     SDValue Ops[] = {
    4904             :       Op.getOperand(0), // Chain
    4905             :       Op.getOperand(2), // rsrc
    4906             :       Op.getOperand(3), // vindex
    4907             :       Op.getOperand(4), // offset
    4908             :       Op.getOperand(5), // glc
    4909             :       Op.getOperand(6)  // slc
    4910         190 :     };
    4911             : 
    4912         190 :     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
    4913             :         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
    4914         190 :     EVT VT = Op.getValueType();
    4915         190 :     EVT IntVT = VT.changeTypeToInteger();
    4916             :     auto *M = cast<MemSDNode>(Op);
    4917         190 :     EVT LoadVT = Op.getValueType();
    4918         380 :     bool IsD16 = LoadVT.getScalarType() == MVT::f16;
    4919             :     if (IsD16)
    4920           9 :       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
    4921             : 
    4922             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
    4923         543 :                                    M->getMemOperand());
    4924             :   }
    4925             :   case Intrinsic::amdgcn_tbuffer_load: {
    4926             :     MemSDNode *M = cast<MemSDNode>(Op);
    4927          37 :     EVT LoadVT = Op.getValueType();
    4928          74 :     bool IsD16 = LoadVT.getScalarType() == MVT::f16;
    4929             :     if (IsD16) {
    4930           9 :       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
    4931             :     }
    4932             : 
    4933             :     SDValue Ops[] = {
    4934             :       Op.getOperand(0),  // Chain
    4935             :       Op.getOperand(2),  // rsrc
    4936             :       Op.getOperand(3),  // vindex
    4937             :       Op.getOperand(4),  // voffset
    4938             :       Op.getOperand(5),  // soffset
    4939             :       Op.getOperand(6),  // offset
    4940             :       Op.getOperand(7),  // dfmt
    4941             :       Op.getOperand(8),  // nfmt
    4942             :       Op.getOperand(9),  // glc
    4943             :       Op.getOperand(10)   // slc
    4944          28 :     };
    4945             : 
    4946             :     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
    4947             :                                    Op->getVTList(), Ops, LoadVT,
    4948          84 :                                    M->getMemOperand());
    4949             :   }
    4950          34 :   case Intrinsic::amdgcn_buffer_atomic_swap:
    4951             :   case Intrinsic::amdgcn_buffer_atomic_add:
    4952             :   case Intrinsic::amdgcn_buffer_atomic_sub:
    4953             :   case Intrinsic::amdgcn_buffer_atomic_smin:
    4954             :   case Intrinsic::amdgcn_buffer_atomic_umin:
    4955             :   case Intrinsic::amdgcn_buffer_atomic_smax:
    4956             :   case Intrinsic::amdgcn_buffer_atomic_umax:
    4957             :   case Intrinsic::amdgcn_buffer_atomic_and:
    4958             :   case Intrinsic::amdgcn_buffer_atomic_or:
    4959             :   case Intrinsic::amdgcn_buffer_atomic_xor: {
    4960             :     SDValue Ops[] = {
    4961             :       Op.getOperand(0), // Chain
    4962             :       Op.getOperand(2), // vdata
    4963             :       Op.getOperand(3), // rsrc
    4964             :       Op.getOperand(4), // vindex
    4965             :       Op.getOperand(5), // offset
    4966             :       Op.getOperand(6)  // slc
    4967          34 :     };
    4968          34 :     EVT VT = Op.getValueType();
    4969             : 
    4970             :     auto *M = cast<MemSDNode>(Op);
    4971             :     unsigned Opcode = 0;
    4972             : 
    4973          34 :     switch (IntrID) {
    4974             :     case Intrinsic::amdgcn_buffer_atomic_swap:
    4975             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
    4976             :       break;
    4977           4 :     case Intrinsic::amdgcn_buffer_atomic_add:
    4978             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
    4979           4 :       break;
    4980           2 :     case Intrinsic::amdgcn_buffer_atomic_sub:
    4981             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
    4982           2 :       break;
    4983           2 :     case Intrinsic::amdgcn_buffer_atomic_smin:
    4984             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
    4985           2 :       break;
    4986           2 :     case Intrinsic::amdgcn_buffer_atomic_umin:
    4987             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
    4988           2 :       break;
    4989           2 :     case Intrinsic::amdgcn_buffer_atomic_smax:
    4990             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
    4991           2 :       break;
    4992           2 :     case Intrinsic::amdgcn_buffer_atomic_umax:
    4993             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
    4994           2 :       break;
    4995           2 :     case Intrinsic::amdgcn_buffer_atomic_and:
    4996             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
    4997           2 :       break;
    4998           2 :     case Intrinsic::amdgcn_buffer_atomic_or:
    4999             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
    5000           2 :       break;
    5001           2 :     case Intrinsic::amdgcn_buffer_atomic_xor:
    5002             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
    5003           2 :       break;
    5004           0 :     default:
    5005           0 :       llvm_unreachable("unhandled atomic opcode");
    5006             :     }
    5007             : 
    5008             :     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
    5009         102 :                                    M->getMemOperand());
    5010             :   }
    5011             : 
    5012          12 :   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    5013             :     SDValue Ops[] = {
    5014             :       Op.getOperand(0), // Chain
    5015             :       Op.getOperand(2), // src
    5016             :       Op.getOperand(3), // cmp
    5017             :       Op.getOperand(4), // rsrc
    5018             :       Op.getOperand(5), // vindex
    5019             :       Op.getOperand(6), // offset
    5020             :       Op.getOperand(7)  // slc
    5021          12 :     };
    5022          12 :     EVT VT = Op.getValueType();
    5023             :     auto *M = cast<MemSDNode>(Op);
    5024             : 
    5025             :     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
    5026          36 :                                    Op->getVTList(), Ops, VT, M->getMemOperand());
    5027             :   }
    5028             : 
    5029          73 :   case Intrinsic::amdgcn_image_load:
    5030             :   case Intrinsic::amdgcn_image_load_mip: {
    5031          73 :     EVT VT = Op.getValueType();
    5032         106 :     if (Subtarget->hasUnpackedD16VMem() &&
    5033         100 :         VT.isVector() && VT.getScalarSizeInBits() == 16) {
    5034             :       return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
    5035           3 :                                  DAG);
    5036             :     }
    5037             : 
    5038          70 :     return SDValue();
    5039             :   }
    5040             : 
    5041             :   // Basic sample.
    5042         747 :   case Intrinsic::amdgcn_image_sample:
    5043             :   case Intrinsic::amdgcn_image_sample_cl:
    5044             :   case Intrinsic::amdgcn_image_sample_d:
    5045             :   case Intrinsic::amdgcn_image_sample_d_cl:
    5046             :   case Intrinsic::amdgcn_image_sample_l:
    5047             :   case Intrinsic::amdgcn_image_sample_b:
    5048             :   case Intrinsic::amdgcn_image_sample_b_cl:
    5049             :   case Intrinsic::amdgcn_image_sample_lz:
    5050             :   case Intrinsic::amdgcn_image_sample_cd:
    5051             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    5052             : 
    5053             :   // Sample with comparison.
    5054             :   case Intrinsic::amdgcn_image_sample_c:
    5055             :   case Intrinsic::amdgcn_image_sample_c_cl:
    5056             :   case Intrinsic::amdgcn_image_sample_c_d:
    5057             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    5058             :   case Intrinsic::amdgcn_image_sample_c_l:
    5059             :   case Intrinsic::amdgcn_image_sample_c_b:
    5060             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    5061             :   case Intrinsic::amdgcn_image_sample_c_lz:
    5062             :   case Intrinsic::amdgcn_image_sample_c_cd:
    5063             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    5064             : 
    5065             :   // Sample with offsets.
    5066             :   case Intrinsic::amdgcn_image_sample_o:
    5067             :   case Intrinsic::amdgcn_image_sample_cl_o:
    5068             :   case Intrinsic::amdgcn_image_sample_d_o:
    5069             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    5070             :   case Intrinsic::amdgcn_image_sample_l_o:
    5071             :   case Intrinsic::amdgcn_image_sample_b_o:
    5072             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    5073             :   case Intrinsic::amdgcn_image_sample_lz_o:
    5074             :   case Intrinsic::amdgcn_image_sample_cd_o:
    5075             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    5076             : 
    5077             :   // Sample with comparison and offsets.
    5078             :   case Intrinsic::amdgcn_image_sample_c_o:
    5079             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    5080             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    5081             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    5082             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    5083             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    5084             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    5085             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    5086             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    5087             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    5088             : 
    5089             :     // Basic gather4
    5090             :   case Intrinsic::amdgcn_image_gather4:
    5091             :   case Intrinsic::amdgcn_image_gather4_cl:
    5092             :   case Intrinsic::amdgcn_image_gather4_l:
    5093             :   case Intrinsic::amdgcn_image_gather4_b:
    5094             :   case Intrinsic::amdgcn_image_gather4_b_cl:
    5095             :   case Intrinsic::amdgcn_image_gather4_lz:
    5096             : 
    5097             :   // Gather4 with comparison
    5098             :   case Intrinsic::amdgcn_image_gather4_c:
    5099             :   case Intrinsic::amdgcn_image_gather4_c_cl:
    5100             :   case Intrinsic::amdgcn_image_gather4_c_l:
    5101             :   case Intrinsic::amdgcn_image_gather4_c_b:
    5102             :   case Intrinsic::amdgcn_image_gather4_c_b_cl:
    5103             :   case Intrinsic::amdgcn_image_gather4_c_lz:
    5104             : 
    5105             :   // Gather4 with offsets
    5106             :   case Intrinsic::amdgcn_image_gather4_o:
    5107             :   case Intrinsic::amdgcn_image_gather4_cl_o:
    5108             :   case Intrinsic::amdgcn_image_gather4_l_o:
    5109             :   case Intrinsic::amdgcn_image_gather4_b_o:
    5110             :   case Intrinsic::amdgcn_image_gather4_b_cl_o:
    5111             :   case Intrinsic::amdgcn_image_gather4_lz_o:
    5112             : 
    5113             :   // Gather4 with comparison and offsets
    5114             :   case Intrinsic::amdgcn_image_gather4_c_o:
    5115             :   case Intrinsic::amdgcn_image_gather4_c_cl_o:
    5116             :   case Intrinsic::amdgcn_image_gather4_c_l_o:
    5117             :   case Intrinsic::amdgcn_image_gather4_c_b_o:
    5118             :   case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
    5119             :   case Intrinsic::amdgcn_image_gather4_c_lz_o: {
    5120             :     // Replace dmask with everything disabled with undef.
    5121             :     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
    5122        1490 :     if (!DMask || DMask->isNullValue()) {
    5123          62 :       SDValue Undef = DAG.getUNDEF(Op.getValueType());
    5124         248 :       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
    5125             :     }
    5126             : 
    5127        1004 :     if (Subtarget->hasUnpackedD16VMem() &&
    5128        2004 :         Op.getValueType().isVector() &&
    5129        1000 :         Op.getValueType().getScalarSizeInBits() == 16) {
    5130             :       return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
    5131          11 :                                  DAG);
    5132             :     }
    5133             : 
    5134         674 :     return SDValue();
    5135             :   }
    5136         463 :   default:
    5137         580 :     if (Subtarget->hasUnpackedD16VMem() &&
    5138        1100 :         Op.getValueType().isVector() &&
    5139         520 :         Op.getValueType().getScalarSizeInBits() == 16) {
    5140           7 :       if (const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
    5141           7 :             AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID)) {
    5142           7 :         return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
    5143           7 :                                    cast<MemSDNode>(Op), DAG, true);
    5144             :       }
    5145             :     }
    5146             : 
    5147         456 :     return SDValue();
    5148             :   }
    5149             : }
    5150             : 
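                     : // Prepare 16-bit vector store data for D16 buffer/image instructions. On
                     : // subtargets with unpacked D16 VMEM each 16-bit element must occupy its own
                     : // 32-bit register, so packed vectors are zero-extended element-wise; on
                     : // packed subtargets the data is already in the expected form.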
    5151          24 : SDValue SITargetLowering::handleD16VData(SDValue VData,
    5152             :                                          SelectionDAG &DAG) const {
    5153          24 :   EVT StoreVT = VData.getValueType();
    5154             : 
    5155             :   // No change for f16 and legal vector D16 types.
    5156          24 :   if (!StoreVT.isVector())
    5157           6 :     return VData;
    5158             : 
    5159             :   SDLoc DL(VData);
    5160             :   assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
    5161             : 
    5162          18 :   if (Subtarget->hasUnpackedD16VMem()) {
    5163             :     // We need to unpack the packed data to store.
    5164          10 :     EVT IntStoreVT = StoreVT.changeTypeToInteger();
    5165          10 :     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
    5166             : 
    5167          10 :     EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    5168          20 :                                         StoreVT.getVectorNumElements());
    5169          10 :     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    5170          10 :     return DAG.UnrollVectorOp(ZExt.getNode());
    5171             :   }
    5172             : 
    5173             :   assert(isTypeLegal(StoreVT));
    5174           8 :   return VData;
    5175             : }
    5176             : 
    5177        2318 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    5178             :                                               SelectionDAG &DAG) const {
    5179             :   SDLoc DL(Op);
    5180        2318 :   SDValue Chain = Op.getOperand(0);
    5181        4636 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    5182        2318 :   MachineFunction &MF = DAG.getMachineFunction();
    5183             : 
    5184        2318 :   switch (IntrinsicID) {
    5185         355 :   case Intrinsic::amdgcn_exp: {
    5186             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5187             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5188             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    5189             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
    5190             : 
    5191             :     const SDValue Ops[] = {
    5192             :       Chain,
    5193         355 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5194         355 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5195             :       Op.getOperand(4), // src0
    5196             :       Op.getOperand(5), // src1
    5197             :       Op.getOperand(6), // src2
    5198             :       Op.getOperand(7), // src3
    5199             :       DAG.getTargetConstant(0, DL, MVT::i1), // compr
    5200         355 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5201        2130 :     };
    5202             : 
    5203         710 :     unsigned Opc = Done->isNullValue() ?
    5204             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5205         710 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5206             :   }
    5207          96 :   case Intrinsic::amdgcn_exp_compr: {
    5208             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5209             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5210          96 :     SDValue Src0 = Op.getOperand(4);
    5211          96 :     SDValue Src1 = Op.getOperand(5);
    5212             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    5213             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
    5214             : 
    5215          96 :     SDValue Undef = DAG.getUNDEF(MVT::f32);
    5216             :     const SDValue Ops[] = {
    5217             :       Chain,
    5218          96 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5219          96 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5220          96 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
    5221          96 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    5222             :       Undef, // src2
    5223             :       Undef, // src3
    5224             :       DAG.getTargetConstant(1, DL, MVT::i1), // compr
    5225          96 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5226         768 :     };
    5227             : 
    5228         192 :     unsigned Opc = Done->isNullValue() ?
    5229             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5230         192 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5231             :   }
    5232          26 :   case Intrinsic::amdgcn_s_sendmsg:
    5233             :   case Intrinsic::amdgcn_s_sendmsghalt: {
    5234          26 :     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
    5235             :       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
    5236          26 :     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    5237          26 :     SDValue Glue = Chain.getValue(1);
    5238             :     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
    5239          26 :                        Op.getOperand(2), Glue);
    5240             :   }
    5241             :   case Intrinsic::amdgcn_init_exec: {
    5242             :     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
    5243           3 :                        Op.getOperand(2));
    5244             :   }
    5245             :   case Intrinsic::amdgcn_init_exec_from_input: {
    5246             :     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
    5247           4 :                        Op.getOperand(2), Op.getOperand(3));
    5248             :   }
    5249          31 :   case AMDGPUIntrinsic::AMDGPU_kill: {
    5250          31 :     SDValue Src = Op.getOperand(2);
    5251             :     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
    5252          22 :       if (!K->isNegative())
    5253           4 :         return Chain;
    5254             : 
    5255           7 :       SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
    5256           7 :       return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    5257             :     }
    5258             : 
    5259          20 :     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    5260          20 :     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
    5261             :   }
    5262         149 :   case Intrinsic::amdgcn_s_barrier: {
    5263         149 :     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
    5264         141 :       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    5265         141 :       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
    5266         141 :       if (WGSize <= ST.getWavefrontSize())
    5267          10 :         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
    5268           5 :                                           Op.getOperand(0)), 0);
    5269             :     }
    5270         144 :     return SDValue();
    5271             :   }
    5272          14 :   case AMDGPUIntrinsic::SI_tbuffer_store: {
    5273             : 
    5274             :     // Extract vindex and voffset from vaddr as appropriate
    5275             :     const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
    5276             :     const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
    5277          14 :     SDValue VAddr = Op.getOperand(5);
    5278             : 
    5279          14 :     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
    5280             : 
    5281             :     assert(!(OffEn->isOne() && IdxEn->isOne()) &&
    5282             :            "Legacy intrinsic doesn't support both offset and index - use new version");
    5283             : 
    5284          28 :     SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
    5285          28 :     SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
    5286             : 
    5287             :     // Deal with the vec-3 case
    5288             :     const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
    5289          28 :     auto Opcode = NumChannels->getZExtValue() == 3 ?
    5290             :       AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
    5291             : 
    5292             :     SDValue Ops[] = {
    5293             :      Chain,
    5294             :      Op.getOperand(3),  // vdata
    5295             :      Op.getOperand(2),  // rsrc
    5296             :      VIndex,
    5297             :      VOffset,
    5298             :      Op.getOperand(6),  // soffset
    5299             :      Op.getOperand(7),  // inst_offset
    5300             :      Op.getOperand(8),  // dfmt
    5301             :      Op.getOperand(9),  // nfmt
    5302             :      Op.getOperand(12), // glc
    5303             :      Op.getOperand(13), // slc
    5304          14 :     };
    5305             : 
    5306             :     assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
    5307             :            "Value of tfe other than zero is unsupported");
    5308             : 
    5309          28 :     EVT VT = Op.getOperand(3).getValueType();
    5310          28 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    5311             :       MachinePointerInfo(),
    5312             :       MachineMemOperand::MOStore,
    5313          14 :       VT.getStoreSize(), 4);
    5314             :     return DAG.getMemIntrinsicNode(Opcode, DL,
    5315          28 :                                    Op->getVTList(), Ops, VT, MMO);
    5316             :   }
    5317             : 
    5318          41 :   case Intrinsic::amdgcn_tbuffer_store: {
    5319          41 :     SDValue VData = Op.getOperand(2);
    5320          82 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5321          41 :     if (IsD16)
    5322           9 :       VData = handleD16VData(VData, DAG);
    5323             :     SDValue Ops[] = {
    5324             :       Chain,
    5325             :       VData,             // vdata
    5326             :       Op.getOperand(3),  // rsrc
    5327             :       Op.getOperand(4),  // vindex
    5328             :       Op.getOperand(5),  // voffset
    5329             :       Op.getOperand(6),  // soffset
    5330             :       Op.getOperand(7),  // offset
    5331             :       Op.getOperand(8),  // dfmt
    5332             :       Op.getOperand(9),  // nfmt
    5333             :       Op.getOperand(10), // glc
    5334             :       Op.getOperand(11)  // slc
    5335          82 :     };
    5336          41 :     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
    5337             :                            AMDGPUISD::TBUFFER_STORE_FORMAT;
    5338             :     MemSDNode *M = cast<MemSDNode>(Op);
    5339             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5340         123 :                                    M->getMemoryVT(), M->getMemOperand());
    5341             :   }
    5342             : 
    5343         153 :   case Intrinsic::amdgcn_buffer_store:
    5344             :   case Intrinsic::amdgcn_buffer_store_format: {
    5345         153 :     SDValue VData = Op.getOperand(2);
    5346         306 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5347         153 :     if (IsD16)
    5348           9 :       VData = handleD16VData(VData, DAG);
    5349             :     SDValue Ops[] = {
    5350             :       Chain,
    5351             :       VData,            // vdata
    5352             :       Op.getOperand(3), // rsrc
    5353             :       Op.getOperand(4), // vindex
    5354             :       Op.getOperand(5), // offset
    5355             :       Op.getOperand(6), // glc
    5356             :       Op.getOperand(7)  // slc
    5357         306 :     };
    5358         153 :     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
    5359             :                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    5360         153 :     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    5361             :     MemSDNode *M = cast<MemSDNode>(Op);
    5362             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5363         459 :                                    M->getMemoryVT(), M->getMemOperand());
    5364             :   }
    5365          77 :   case Intrinsic::amdgcn_image_store:
    5366             :   case Intrinsic::amdgcn_image_store_mip: {
    5367          77 :     SDValue VData = Op.getOperand(2);
    5368          77 :     EVT VT = VData.getValueType();
    5369         112 :     if (Subtarget->hasUnpackedD16VMem() &&
    5370         108 :         VT.isVector() && VT.getScalarSizeInBits() == 16) {
    5371           3 :       SDValue Chain = Op.getOperand(0);
    5372             : 
    5373           3 :       VData = handleD16VData(VData, DAG);
    5374             :       SDValue Ops[] = {
    5375             :         Chain, // Chain
    5376             :         VData, // vdata
    5377             :         Op.getOperand(3), // vaddr
    5378             :         Op.getOperand(4), // rsrc
    5379             :         Op.getOperand(5), // dmask
    5380             :         Op.getOperand(6), // glc
    5381             :         Op.getOperand(7), // slc
    5382             :         Op.getOperand(8), // lwe
    5383             :         Op.getOperand(9)  // da
    5384           6 :       };
    5385           3 :       unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ?
    5386             :         AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
    5387             :       MemSDNode *M = cast<MemSDNode>(Op);
    5388             :       return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5389           9 :                                      M->getMemoryVT(), M->getMemOperand());
    5390             :     }
    5391             : 
    5392          74 :     return SDValue();
    5393             :   }
    5394        1369 :   default: {
    5395             :     const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
    5396        1369 :         AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
    5397        1369 :     if (D16ImageDimIntr) {
    5398         135 :       SDValue VData = Op.getOperand(2);
    5399         135 :       EVT StoreVT = VData.getValueType();
    5400         178 :       if (Subtarget->hasUnpackedD16VMem() &&
    5401         174 :           StoreVT.isVector() &&
    5402             :           StoreVT.getScalarSizeInBits() == 16) {
    5403             :         SmallVector<SDValue, 12> Ops(Op.getNode()->op_values());
    5404             : 
    5405           6 :         Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
    5406           6 :         Ops[2] = handleD16VData(VData, DAG);
    5407             : 
    5408             :         MemSDNode *M = cast<MemSDNode>(Op);
    5409             :         return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
    5410             :                                        Ops, M->getMemoryVT(),
    5411           9 :                                        M->getMemOperand());
    5412             :       }
    5413             :     }
    5414             : 
    5415        1366 :     return Op;
    5416             :   }
    5417             :   }
    5418             : }
    5419             : 
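                     : // Re-apply the requested load extension (or a truncate) to a value that has
                     : // already been widened to a 32-bit integer, producing the type the original
                     : // load was expected to return.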
    5420         478 : static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
    5421             :                                  ISD::LoadExtType ExtType, SDValue Op,
    5422             :                                  const SDLoc &SL, EVT VT) {
    5423         478 :   if (VT.bitsLT(Op.getValueType()))
    5424         143 :     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
    5425             : 
    5426         335 :   switch (ExtType) {
    5427             :   case ISD::SEXTLOAD:
    5428          25 :     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
    5429             :   case ISD::ZEXTLOAD:
    5430         236 :     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
    5431             :   case ISD::EXTLOAD:
    5432          74 :     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
    5433           0 :   case ISD::NON_EXTLOAD:
    5434           0 :     return Op;
    5435             :   }
    5436             : 
    5437           0 :   llvm_unreachable("invalid ext type");
    5438             : }
    5439             : 
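                     : // Widen uniform, sufficiently aligned sub-dword loads from constant-like
                     : // address spaces to 32-bit loads, then extend or truncate the result back to
                     : // the original type so the scalar memory path can be used for them.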
    5440      231004 : SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
    5441      231004 :   SelectionDAG &DAG = DCI.DAG;
    5442      668679 :   if (Ld->getAlignment() < 4 || Ld->isDivergent())
    5443       73466 :     return SDValue();
    5444             : 
    5445             :   // FIXME: Constant loads should all be marked invariant.
    5446             :   unsigned AS = Ld->getAddressSpace();
    5447      315076 :   if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
    5448      315076 :       AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
    5449       27026 :       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    5450       48456 :     return SDValue();
    5451             : 
    5452             :   // Don't do this early, since it may interfere with adjacent load merging for
    5453             :   // illegal types. We can avoid losing alignment information for exotic types
    5454             :   // pre-legalize.
    5455      109082 :   EVT MemVT = Ld->getMemoryVT();
    5456      152570 :   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
    5457       43488 :       MemVT.getSizeInBits() >= 32)
    5458      108604 :     return SDValue();
    5459             : 
    5460             :   SDLoc SL(Ld);
    5461             : 
    5462             :   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
    5463             :          "unexpected vector extload");
    5464             : 
    5465             :   // TODO: Drop only high part of range.
    5466         478 :   SDValue Ptr = Ld->getBasePtr();
    5467             :   SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
    5468             :                                 MVT::i32, SL, Ld->getChain(), Ptr,
    5469             :                                 Ld->getOffset(),
    5470         478 :                                 Ld->getPointerInfo(), MVT::i32,
    5471             :                                 Ld->getAlignment(),
    5472         478 :                                 Ld->getMemOperand()->getFlags(),
    5473         956 :                                 Ld->getAAInfo(),
    5474         956 :                                 nullptr); // Drop ranges
    5475             : 
    5476         478 :   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
    5477         478 :   if (MemVT.isFloatingPoint()) {
    5478             :     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
    5479             :            "unexpected fp extload");
    5480           0 :     TruncVT = MemVT.changeTypeToInteger();
    5481             :   }
    5482             : 
    5483         478 :   SDValue Cvt = NewLoad;
    5484         478 :   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    5485          28 :     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
    5486          56 :                       DAG.getValueType(TruncVT));
    5487         450 :   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    5488             :              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    5489         364 :     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
    5490             :   } else {
    5491             :     assert(Ld->getExtensionType() == ISD::EXTLOAD);
    5492             :   }
    5493             : 
    5494         956 :   EVT VT = Ld->getValueType(0);
    5495         478 :   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
    5496             : 
    5497         478 :   DCI.AddToWorklist(Cvt.getNode());
    5498             : 
    5499             :   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
    5500             :   // the appropriate extension from the 32-bit load.
    5501         478 :   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
    5502         478 :   DCI.AddToWorklist(Cvt.getNode());
    5503             : 
    5504             :   // Handle conversion back to floating point if necessary.
    5505         478 :   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
    5506             : 
    5507         956 :   return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
    5508             : }
    5509             : 
    5510       84548 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    5511             :   SDLoc DL(Op);
    5512             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    5513             :   ISD::LoadExtType ExtType = Load->getExtensionType();
    5514       84548 :   EVT MemVT = Load->getMemoryVT();
    5515             : 
    5516       84548 :   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    5517             :     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
    5518        2408 :       return SDValue();
    5519             : 
    5520             :     // FIXME: Copied from PPC
    5521             :     // First, load into 32 bits, then truncate to 1 bit.
    5522             : 
    5523         147 :     SDValue Chain = Load->getChain();
    5524         147 :     SDValue BasePtr = Load->getBasePtr();
    5525         147 :     MachineMemOperand *MMO = Load->getMemOperand();
    5526             : 
    5527             :     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
    5528             : 
    5529             :     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
    5530         147 :                                    BasePtr, RealMemVT, MMO);
    5531             : 
    5532             :     SDValue Ops[] = {
    5533         147 :       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
    5534             :       NewLD.getValue(1)
    5535         294 :     };
    5536             : 
    5537         147 :     return DAG.getMergeValues(Ops, DL);
    5538             :   }
    5539             : 
    5540       81993 :   if (!MemVT.isVector())
    5541           0 :     return SDValue();
    5542             : 
    5543             :   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
    5544             :          "Custom lowering for non-i32 vectors hasn't been implemented.");
    5545             : 
    5546       81993 :   unsigned Alignment = Load->getAlignment();
    5547             :   unsigned AS = Load->getAddressSpace();
    5548      163986 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
    5549             :                           AS, Alignment)) {
    5550           2 :     SDValue Ops[2];
    5551           4 :     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    5552           2 :     return DAG.getMergeValues(Ops, DL);
    5553             :   }
    5554             : 
    5555       81991 :   MachineFunction &MF = DAG.getMachineFunction();
    5556       81991 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    5557             :   // If there is a possibilty that flat instruction access scratch memory
    5558             :   // then we need to use the same legalization rules we use for private.
    5559       81991 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5560          29 :     AS = MFI->hasFlatScratchInit() ?
    5561             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5562             : 
    5563       81991 :   unsigned NumElements = MemVT.getVectorNumElements();
    5564             : 
    5565      163982 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5566       81991 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
    5567       58003 :     if (!Op->isDivergent() && Alignment >= 4)
    5568       57752 :       return SDValue();
    5569             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5570             :     // have the same legalization requirements as global and private
    5571             :     // loads.
    5572             :     //
    5573             :   }
    5574             : 
    5575       24239 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5576       23988 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5577             :       AS == AMDGPUASI.GLOBAL_ADDRESS) {
    5578       22924 :     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
    5579       15367 :         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
    5580             :         Alignment >= 4)
    5581         792 :       return SDValue();
    5582             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5583             :     // have the same legalization requirements as global and private
    5584             :     // loads.
    5585             :     //
    5586             :   }
    5587       23447 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5588       23196 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5589       10832 :       AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5590       10832 :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5591       12615 :     if (NumElements > 4)
    5592        1205 :       return SplitVectorLoad(Op, DAG);
    5593             :     // v4 loads are supported for private and global memory.
    5594       11410 :     return SDValue();
    5595             :   }
    5596       10832 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    5597             :     // Depending on the setting of the private_element_size field in the
    5598             :     // resource descriptor, we can only make private accesses up to a certain
    5599             :     // size.
    5600         384 :     switch (Subtarget->getMaxPrivateElementSize()) {
    5601         217 :     case 4:
    5602         217 :       return scalarizeVectorLoad(Load, DAG);
    5603          53 :     case 8:
    5604          53 :       if (NumElements > 2)
    5605           5 :         return SplitVectorLoad(Op, DAG);
    5606          48 :       return SDValue();
    5607         114 :     case 16:
    5608             :       // Same as global/flat
    5609         114 :       if (NumElements > 4)
    5610           1 :         return SplitVectorLoad(Op, DAG);
    5611         113 :       return SDValue();
    5612           0 :     default:
    5613           0 :       llvm_unreachable("unsupported private_element_size");
    5614             :     }
    5615       10448 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    5616             :     // Use ds_read_b128 if possible.
    5617       14206 :     if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
    5618             :         MemVT.getStoreSize() == 16)
    5619         996 :       return SDValue();
    5620             : 
    5621        9452 :     if (NumElements > 2)
    5622        1156 :       return SplitVectorLoad(Op, DAG);
    5623             :   }
    5624        8296 :   return SDValue();
    5625             : }
    5626             : 
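                     : // Lower a 64-bit select by bitcasting the operands to v2i32, selecting the
                     : // low and high halves with two 32-bit selects, and bitcasting the rebuilt
                     : // vector back to the original type.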
    5627         654 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    5628         654 :   EVT VT = Op.getValueType();
    5629             :   assert(VT.getSizeInBits() == 64);
    5630             : 
    5631             :   SDLoc DL(Op);
    5632         654 :   SDValue Cond = Op.getOperand(0);
    5633             : 
    5634         654 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    5635         654 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    5636             : 
    5637         654 :   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    5638         654 :   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
    5639             : 
    5640         654 :   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
    5641         654 :   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
    5642             : 
    5643         654 :   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
    5644             : 
    5645         654 :   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
    5646         654 :   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
    5647             : 
    5648         654 :   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
    5649             : 
    5650        1308 :   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
    5651        1308 :   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
    5652             : }
    5653             : 
    5654             : // Catch division cases where we can use shortcuts with rcp and rsq
    5655             : // instructions.
    5656         174 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
    5657             :                                               SelectionDAG &DAG) const {
    5658             :   SDLoc SL(Op);
    5659         174 :   SDValue LHS = Op.getOperand(0);
    5660         174 :   SDValue RHS = Op.getOperand(1);
    5661         174 :   EVT VT = Op.getValueType();
    5662         174 :   const SDNodeFlags Flags = Op->getFlags();
    5663         174 :   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
    5664             : 
    5665         137 :   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    5666          16 :     return SDValue();
    5667             : 
    5668             :   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    5669          98 :     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    5670         196 :       if (CLHS->isExactlyValue(1.0)) {
    5671             :         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    5672             :         // the CI documentation have a worst case error of 1 ulp.
    5673             :         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    5674             :         // use it as long as we aren't trying to use denormals.
    5675             :         //
    5676             :         // v_rcp_f16 and v_rsq_f16 DO support denormals.
    5677             : 
    5678             :         // 1.0 / sqrt(x) -> rsq(x)
    5679             : 
    5680             :         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
    5681             :         // error seems really high at 2^29 ULP.
    5682          65 :         if (RHS.getOpcode() == ISD::FSQRT)
    5683           7 :           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
    5684             : 
    5685             :         // 1.0 / x -> rcp(x)
    5686          58 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5687             :       }
    5688             : 
    5689             :       // Same as for 1.0, but expand the sign out of the constant.
    5690          66 :       if (CLHS->isExactlyValue(-1.0)) {
    5691             :         // -1.0 / x -> rcp (fneg x)
    5692          33 :         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    5693          33 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    5694             :       }
    5695             :     }
    5696             :   }
    5697             : 
    5698          60 :   if (Unsafe) {
    5699             :     // Turn into multiply by the reciprocal.
    5700             :     // x / y -> x * (1.0 / y)
    5701          12 :     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5702          12 :     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
    5703             :   }
    5704             : 
    5705          48 :   return SDValue();
    5706             : }
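
Written as rewrite rules rather than DAG construction, the shortcuts above are
roughly (and only under unsafe FP math / allow-reciprocal, or for f32/f16 when
denormals are not required):

   1.0 / sqrt(x)  ->  rsq(x)
   1.0 / x        ->  rcp(x)
  -1.0 / x        ->  rcp(fneg x)
   x / y          ->  x * rcp(y)    (unsafe math only)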
    5707             : 
    5708          61 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5709             :                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
    5710          61 :   if (GlueChain->getNumValues() <= 1) {
    5711          16 :     return DAG.getNode(Opcode, SL, VT, A, B);
    5712             :   }
    5713             : 
    5714             :   assert(GlueChain->getNumValues() == 3);
    5715             : 
    5716          45 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5717          45 :   switch (Opcode) {
    5718           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5719          45 :   case ISD::FMUL:
    5720             :     Opcode = AMDGPUISD::FMUL_W_CHAIN;
    5721             :     break;
    5722             :   }
    5723             : 
    5724             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
    5725          45 :                      GlueChain.getValue(2));
    5726             : }
    5727             : 
    5728         305 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5729             :                            EVT VT, SDValue A, SDValue B, SDValue C,
    5730             :                            SDValue GlueChain) {
    5731         305 :   if (GlueChain->getNumValues() <= 1) {
    5732          80 :     return DAG.getNode(Opcode, SL, VT, A, B, C);
    5733             :   }
    5734             : 
    5735             :   assert(GlueChain->getNumValues() == 3);
    5736             : 
    5737         225 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5738         225 :   switch (Opcode) {
    5739           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5740         225 :   case ISD::FMA:
    5741             :     Opcode = AMDGPUISD::FMA_W_CHAIN;
    5742             :     break;
    5743             :   }
    5744             : 
    5745             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
    5746         225 :                      GlueChain.getValue(2));
    5747             : }
    5748             : 
    5749          27 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
    5750          27 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5751          24 :     return FastLowered;
    5752             : 
    5753             :   SDLoc SL(Op);
    5754           3 :   SDValue Src0 = Op.getOperand(0);
    5755           3 :   SDValue Src1 = Op.getOperand(1);
    5756             : 
    5757           3 :   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    5758           3 :   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
    5759             : 
    5760           3 :   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
    5761           3 :   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
    5762             : 
    5763           3 :   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
    5764           3 :   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
    5765             : 
    5766           3 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
    5767             : }
    5768             : 
    5769             : // Faster 2.5 ULP division that does not support denormals.
    5770          33 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
    5771             :   SDLoc SL(Op);
    5772          33 :   SDValue LHS = Op.getOperand(1);
    5773          33 :   SDValue RHS = Op.getOperand(2);
    5774             : 
    5775          33 :   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
    5776             : 
    5777          33 :   const APFloat K0Val(BitsToFloat(0x6f800000));
    5778          33 :   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
    5779             : 
    5780          33 :   const APFloat K1Val(BitsToFloat(0x2f800000));
    5781          33 :   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
    5782             : 
    5783          33 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5784             : 
    5785             :   EVT SetCCVT =
    5786          66 :     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
    5787             : 
    5788          33 :   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
    5789             : 
    5790          33 :   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
    5791             : 
    5792             :   // TODO: Should this propagate fast-math-flags?
    5793          33 :   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
    5794             : 
    5795             :   // rcp does not support denormals.
    5796          33 :   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
    5797             : 
    5798          33 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
    5799             : 
    5800          66 :   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
    5801             : }
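
A hedged reading of the two magic constants: reinterpreted as f32, 0x6f800000
is 2^96 and 0x2f800000 is 2^-32, so with x and y standing for the numerator and
denominator operands the sequence above computes approximately

  r3  = (|y| > 2^96) ? 2^-32 : 1.0
  out = r3 * (x * rcp(y * r3))  ~=  x / y

i.e. a very large denominator is pre-scaled by 2^-32 so the rcp operand stays
in the range where it is accurate, and the quotient is rescaled by the same
factor afterwards.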
    5802             : 
    5803         140 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
    5804         140 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5805          79 :     return FastLowered;
    5806             : 
    5807             :   SDLoc SL(Op);
    5808          61 :   SDValue LHS = Op.getOperand(0);
    5809          61 :   SDValue RHS = Op.getOperand(1);
    5810             : 
    5811          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5812             : 
    5813          61 :   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
    5814             : 
    5815             :   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5816          61 :                                           RHS, RHS, LHS);
    5817             :   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5818          61 :                                         LHS, RHS, LHS);
    5819             : 
    5820             :   // Denominator is scaled to not be denormal, so using rcp is ok.
    5821             :   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
    5822          61 :                                   DenominatorScaled);
    5823             :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
    5824          61 :                                      DenominatorScaled);
    5825             : 
    5826             :   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
    5827             :                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
    5828             :                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
    5829             : 
    5830          61 :   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
    5831             : 
    5832          61 :   if (!Subtarget->hasFP32Denormals()) {
    5833          45 :     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    5834             :     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
    5835          45 :                                                       SL, MVT::i32);
    5836             :     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
    5837             :                                        DAG.getEntryNode(),
    5838          45 :                                        EnableDenormValue, BitField);
    5839             :     SDValue Ops[3] = {
    5840             :       NegDivScale0,
    5841             :       EnableDenorm.getValue(0),
    5842             :       EnableDenorm.getValue(1)
    5843          45 :     };
    5844             : 
    5845          45 :     NegDivScale0 = DAG.getMergeValues(Ops, SL);
    5846             :   }
    5847             : 
    5848             :   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
    5849          61 :                              ApproxRcp, One, NegDivScale0);
    5850             : 
    5851             :   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
    5852          61 :                              ApproxRcp, Fma0);
    5853             : 
    5854             :   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
    5855          61 :                            Fma1, Fma1);
    5856             : 
    5857             :   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
    5858          61 :                              NumeratorScaled, Mul);
    5859             : 
    5860          61 :   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
    5861             : 
    5862             :   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
    5863          61 :                              NumeratorScaled, Fma3);
    5864             : 
    5865          61 :   if (!Subtarget->hasFP32Denormals()) {
    5866             :     const SDValue DisableDenormValue =
    5867          45 :         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    5868             :     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
    5869             :                                         Fma4.getValue(1),
    5870             :                                         DisableDenormValue,
    5871             :                                         BitField,
    5872          45 :                                         Fma4.getValue(2));
    5873             : 
    5874             :     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    5875          45 :                                       DisableDenorm, DAG.getRoot());
    5876          45 :     DAG.setRoot(OutputChain);
    5877             :   }
    5878             : 
    5879          61 :   SDValue Scale = NumeratorScaled.getValue(1);
    5880             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
    5881          61 :                              Fma4, Fma1, Fma3, Scale);
    5882             : 
    5883          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
    5884             : }
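
Writing d and n for the scaled denominator and numerator and r for the initial
reciprocal approximation (names assumed here only for illustration), the FMA
chain above is a standard Newton-Raphson refinement:

  e0 = 1 - d*r         (Fma0)
  r1 = r + e0*r        (Fma1, refined reciprocal)
  q0 = n * r1          (Mul)
  e1 = n - d*q0        (Fma2, residual)
  q1 = q0 + e1*r1      (Fma3, refined quotient)
  e2 = n - d*q1        (Fma4, final residual)

div_fmas then applies the last correction step (taking the div_scale result bit
into account) and div_fixup handles the special cases.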
    5885             : 
    5886          68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
    5887          68 :   if (DAG.getTarget().Options.UnsafeFPMath)
    5888           7 :     return lowerFastUnsafeFDIV(Op, DAG);
    5889             : 
    5890             :   SDLoc SL(Op);
    5891          61 :   SDValue X = Op.getOperand(0);
    5892          61 :   SDValue Y = Op.getOperand(1);
    5893             : 
    5894          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    5895             : 
    5896          61 :   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
    5897             : 
    5898          61 :   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
    5899             : 
    5900          61 :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
    5901             : 
    5902          61 :   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
    5903             : 
    5904          61 :   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
    5905             : 
    5906          61 :   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
    5907             : 
    5908          61 :   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
    5909             : 
    5910          61 :   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
    5911             : 
    5912          61 :   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
    5913          61 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
    5914             : 
    5915             :   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
    5916          61 :                              NegDivScale0, Mul, DivScale1);
    5917             : 
    5918          61 :   SDValue Scale;
    5919             : 
    5920          61 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    5921             :     // Work around a hardware bug on SI where the condition output from div_scale
    5922             :     // is not usable.
    5923             : 
    5924          23 :     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
    5925             : 
    5926             :     // Figure out which scale to use for div_fmas.
    5927          23 :     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    5928          23 :     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    5929          23 :     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    5930          23 :     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
    5931             : 
    5932          23 :     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    5933          23 :     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
    5934             : 
    5935             :     SDValue Scale0Hi
    5936          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    5937             :     SDValue Scale1Hi
    5938          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
    5939             : 
    5940          23 :     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    5941          23 :     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    5942          23 :     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
    5943             :   } else {
    5944          38 :     Scale = DivScale1.getValue(1);
    5945             :   }
    5946             : 
    5947             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
    5948          61 :                              Fma4, Fma3, Mul, Scale);
    5949             : 
    5950          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
    5951             : }
    5952             : 
    5953         235 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
    5954         235 :   EVT VT = Op.getValueType();
    5955             : 
    5956             :   if (VT == MVT::f32)
    5957         140 :     return LowerFDIV32(Op, DAG);
    5958             : 
    5959             :   if (VT == MVT::f64)
    5960          68 :     return LowerFDIV64(Op, DAG);
    5961             : 
    5962             :   if (VT == MVT::f16)
    5963          27 :     return LowerFDIV16(Op, DAG);
    5964             : 
    5965           0 :   llvm_unreachable("Unexpected type for fdiv");
    5966             : }
    5967             : 
    5968       80302 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    5969             :   SDLoc DL(Op);
    5970             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    5971       80302 :   EVT VT = Store->getMemoryVT();
    5972             : 
    5973             :   if (VT == MVT::i1) {
    5974             :     return DAG.getTruncStore(Store->getChain(), DL,
    5975             :        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
    5976         573 :        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
    5977             :   }
    5978             : 
    5979             :   assert(VT.isVector() &&
    5980             :          Store->getValue().getValueType().getScalarType() == MVT::i32);
    5981             : 
    5982             :   unsigned AS = Store->getAddressSpace();
    5983      240333 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
    5984             :                           AS, Store->getAlignment())) {
    5985          26 :     return expandUnalignedStore(Store, DAG);
    5986             :   }
    5987             : 
    5988       80085 :   MachineFunction &MF = DAG.getMachineFunction();
    5989       80085 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    5990             :   // If there is a possibility that flat instructions access scratch memory,
    5991             :   // then we need to use the same legalization rules we use for private.
    5992       80085 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5993         263 :     AS = MFI->hasFlatScratchInit() ?
    5994             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5995             : 
    5996       80085 :   unsigned NumElements = VT.getVectorNumElements();
    5997       80085 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5998             :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5999       40570 :     if (NumElements > 4)
    6000        3834 :       return SplitVectorStore(Op, DAG);
    6001       36736 :     return SDValue();
    6002       39515 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    6003         605 :     switch (Subtarget->getMaxPrivateElementSize()) {
    6004         325 :     case 4:
    6005         325 :       return scalarizeVectorStore(Store, DAG);
    6006         126 :     case 8:
    6007         126 :       if (NumElements > 2)
    6008          10 :         return SplitVectorStore(Op, DAG);
    6009         116 :       return SDValue();
    6010         154 :     case 16:
    6011         154 :       if (NumElements > 4)
    6012           2 :         return SplitVectorStore(Op, DAG);
    6013         152 :       return SDValue();
    6014           0 :     default:
    6015           0 :       llvm_unreachable("unsupported private_element_size");
    6016             :     }
    6017       38910 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    6018             :     // Use ds_write_b128 if possible.
    6019       53760 :     if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
    6020             :         VT.getStoreSize() == 16)
    6021        4324 :       return SDValue();
    6022             : 
    6023       34586 :     if (NumElements > 2)
    6024        3740 :       return SplitVectorStore(Op, DAG);
    6025       30846 :     return SDValue();
    6026             :   } else {
    6027           0 :     llvm_unreachable("unhandled address space");
    6028             :   }
    6029             : }
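
As with loads, the private-address-space cases above are driven by the
max_private_element_size setting. For a v4i32 private store, for example (an
illustrative reading of the switch above):

  max_private_element_size = 4   ->  scalarized into four dword stores
  max_private_element_size = 8   ->  split (4 elements > 2)
  max_private_element_size = 16  ->  kept as a single 16-byte access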
    6030             : 
    6031          51 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    6032             :   SDLoc DL(Op);
    6033          51 :   EVT VT = Op.getValueType();
    6034          51 :   SDValue Arg = Op.getOperand(0);
    6035             :   // TODO: Should this propagate fast-math-flags?
    6036             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    6037             :                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
    6038             :                                               DAG.getConstantFP(0.5/M_PI, DL,
    6039          51 :                                                                 VT)));
    6040             : 
    6041          51 :   switch (Op.getOpcode()) {
    6042             :   case ISD::FCOS:
    6043          48 :     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
    6044             :   case ISD::FSIN:
    6045          54 :     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
    6046           0 :   default:
    6047           0 :     llvm_unreachable("Wrong trig opcode");
    6048             :   }
    6049             : }
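
The constant 0.5/M_PI converts the argument from radians to fractions of a
full period, so the lowering computes, roughly,

  sin(x) ~= SIN_HW(fract(x / (2*pi)))

with FRACT reducing the operand to [0, 1) before the hardware instruction,
which expects its input as a fraction of a full period rather than in radians.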
    6050             : 
    6051         261 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    6052             :   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
    6053             :   assert(AtomicNode->isCompareAndSwap());
    6054             :   unsigned AS = AtomicNode->getAddressSpace();
    6055             : 
    6056             :   // No custom lowering required for local address space
    6057         261 :   if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
    6058          66 :     return Op;
    6059             : 
    6060             :   // Non-local address spaces require custom lowering for atomic compare and
    6061             :   // swap; the cmp and swap values are packed into a v2i32 (v2i64 for _X2).
    6062             :   SDLoc DL(Op);
    6063         195 :   SDValue ChainIn = Op.getOperand(0);
    6064         195 :   SDValue Addr = Op.getOperand(1);
    6065         195 :   SDValue Old = Op.getOperand(2);
    6066         195 :   SDValue New = Op.getOperand(3);
    6067         195 :   EVT VT = Op.getValueType();
    6068         195 :   MVT SimpleVT = VT.getSimpleVT();
    6069         195 :   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
    6070             : 
    6071         390 :   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
    6072         195 :   SDValue Ops[] = { ChainIn, Addr, NewOld };
    6073             : 
    6074             :   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
    6075         585 :                                  Ops, VT, AtomicNode->getMemOperand());
    6076             : }
    6077             : 
    6078             : //===----------------------------------------------------------------------===//
    6079             : // Custom DAG optimizations
    6080             : //===----------------------------------------------------------------------===//
    6081             : 
    6082        1124 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
    6083             :                                                      DAGCombinerInfo &DCI) const {
    6084        2248 :   EVT VT = N->getValueType(0);
    6085        1124 :   EVT ScalarVT = VT.getScalarType();
    6086        1124 :   if (ScalarVT != MVT::f32)
    6087         230 :     return SDValue();
    6088             : 
    6089         894 :   SelectionDAG &DAG = DCI.DAG;
    6090             :   SDLoc DL(N);
    6091             : 
    6092         894 :   SDValue Src = N->getOperand(0);
    6093             :   EVT SrcVT = Src.getValueType();
    6094             : 
    6095             :   // TODO: We could try to match extracting the higher bytes, which would be
    6096             :   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
    6097             :   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
    6098             :   // about in practice.
    6099         894 :   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    6100         804 :     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
    6101         106 :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
    6102         106 :       DCI.AddToWorklist(Cvt.getNode());
    6103         106 :       return Cvt;
    6104             :     }
    6105             :   }
    6106             : 
    6107         788 :   return SDValue();
    6108             : }
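
Assuming this combine fires on an integer-to-FP conversion, as the name
suggests, a typical instance (illustrative) is an i32 source of (and x, 0xff):
its high 24 bits are known zero, so the node collapses to a direct
byte-to-float conversion:

  (f32 uint_to_fp (i32 and x, 255))  ->  (f32 cvt_f32_ubyte0 (i32 and x, 255))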
    6109             : 
    6110             : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
    6111             : 
    6112             : // This is a variant of
    6113             : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
    6114             : //
    6115             : // The normal DAG combiner will do this, but only if the add has one use,
    6116             : // since otherwise it would increase the number of instructions.
    6117             : //
    6118             : // This prevents us from seeing a constant offset that can be folded into a
    6119             : // memory instruction's addressing mode. If we know the resulting add offset of
    6120             : // a pointer can be folded into an addressing offset, we can replace the pointer
    6121             : // operand with the add of the new constant offset. This eliminates one of the uses,
    6122             : // and may allow the remaining use to also be simplified.
    6123             : //
    6124         200 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    6125             :                                                unsigned AddrSpace,
    6126             :                                                EVT MemVT,
    6127             :                                                DAGCombinerInfo &DCI) const {
    6128         200 :   SDValue N0 = N->getOperand(0);
    6129         200 :   SDValue N1 = N->getOperand(1);
    6130             : 
    6131             :   // We only do this to handle cases where it's profitable when there are
    6132             :   // multiple uses of the add, so defer to the standard combine.
    6133         200 :   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
    6134             :       N0->hasOneUse())
    6135         154 :     return SDValue();
    6136             : 
    6137             :   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
    6138             :   if (!CN1)
    6139           0 :     return SDValue();
    6140             : 
    6141             :   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    6142             :   if (!CAdd)
    6143           2 :     return SDValue();
    6144             : 
    6145             :   // If the resulting offset is too large, we can't fold it into the addressing
    6146             :   // mode offset.
    6147          88 :   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
    6148          44 :   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
    6149             : 
    6150          44 :   AddrMode AM;
    6151          44 :   AM.HasBaseReg = true;
    6152          44 :   AM.BaseOffs = Offset.getSExtValue();
    6153          88 :   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    6154          14 :     return SDValue();
    6155             : 
    6156          30 :   SelectionDAG &DAG = DCI.DAG;
    6157             :   SDLoc SL(N);
    6158          60 :   EVT VT = N->getValueType(0);
    6159             : 
    6160          30 :   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
    6161          30 :   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
    6162             : 
    6163             :   SDNodeFlags Flags;
    6164          30 :   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
    6165           0 :                           (N0.getOpcode() == ISD::OR ||
    6166           0 :                            N0->getFlags().hasNoUnsignedWrap()));
    6167             : 
    6168          30 :   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
    6169             : }
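
The combine rests on the identity (x + c1) << c2 == (x << c2) + (c1 << c2). As
a small worked instance (values chosen only for illustration): with c1 = 16 and
c2 = 2 the folded offset is 16 << 2 = 64, and isLegalAddressingMode then
decides whether that constant can be absorbed into the memory instruction's
immediate offset before the add is rebuilt.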
    6170             : 
    6171      356516 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
    6172             :                                                   DAGCombinerInfo &DCI) const {
    6173      356516 :   SDValue Ptr = N->getBasePtr();
    6174      356516 :   SelectionDAG &DAG = DCI.DAG;
    6175             :   SDLoc SL(N);
    6176             : 
    6177             :   // TODO: We could also do this for multiplies.
    6178      356516 :   if (Ptr.getOpcode() == ISD::SHL) {
    6179             :     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(),  N->getAddressSpace(),
    6180         200 :                                           N->getMemoryVT(), DCI);
    6181         200 :     if (NewPtr) {
    6182          30 :       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
    6183             : 
    6184          60 :       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
    6185          30 :       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    6186             :     }
    6187             :   }
    6188             : 
    6189      356486 :   return SDValue();
    6190             : }
    6191             : 
    6192             : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
    6193        4940 :   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
    6194        4008 :          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
    6195        2013 :          (Opc == ISD::XOR && Val == 0);
    6196             : }
    6197             : 
    6198             : // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor
    6199             : // ops. This will typically happen anyway for a VALU 64-bit and. This exposes
    6200             : // other 32-bit integer combine opportunities since most 64-bit operations are
    6201             : // decomposed this way. TODO: We won't want this for SALU, especially if it is
    6202             : // an inline immediate.
    6203        1995 : SDValue SITargetLowering::splitBinaryBitConstantOp(
    6204             :   DAGCombinerInfo &DCI,
    6205             :   const SDLoc &SL,
    6206             :   unsigned Opc, SDValue LHS,
    6207             :   const ConstantSDNode *CRHS) const {
    6208        1995 :   uint64_t Val = CRHS->getZExtValue();
    6209             :   uint32_t ValLo = Lo_32(Val);
    6210             :   uint32_t ValHi = Hi_32(Val);
    6211        1995 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6212             : 
    6213             :     if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
    6214             :          bitOpWithConstantIsReducible(Opc, ValHi)) ||
    6215         374 :         (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    6216             :     // If we need to materialize a 64-bit immediate, it will be split up later
    6217             :     // anyway. Avoid creating the harder to understand 64-bit immediate
    6218             :     // materialization.
    6219        1626 :     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
    6220             :   }
    6221             : 
    6222         369 :   return SDValue();
    6223             : }
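
A worked instance of the reducible case: for (and i64:x, 0x00000000FFFFFFFF)
the low half becomes (and lo(x), 0xffffffff), which folds to lo(x), and the
high half becomes (and hi(x), 0), which folds to 0, so the split removes the
64-bit operation entirely and no 64-bit immediate is materialized.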
    6224             : 
    6225             : // Returns true if the argument is a boolean value which is not serialized into
    6226             : // memory or an argument and does not require v_cndmask_b32 to be deserialized.
    6227             : static bool isBoolSGPR(SDValue V) {
    6228             :   if (V.getValueType() != MVT::i1)
    6229             :     return false;
    6230         201 :   switch (V.getOpcode()) {
    6231             :   default: break;
    6232             :   case ISD::SETCC:
    6233             :   case ISD::AND:
    6234             :   case ISD::OR:
    6235             :   case ISD::XOR:
    6236             :   case AMDGPUISD::FP_CLASS:
    6237             :     return true;
    6238             :   }
    6239             :   return false;
    6240             : }
    6241             : 
    6242             : // If a constant has all zeroes or all ones within each byte, return it.
    6243             : // Otherwise return 0.
    6244         330 : static uint32_t getConstantPermuteMask(uint32_t C) {
    6245             :   // 0xff for any zero byte in the mask
    6246             :   uint32_t ZeroByteMask = 0;
    6247         330 :   if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
    6248         330 :   if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
    6249         330 :   if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
    6250         330 :   if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
    6251         330 :   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
    6252         330 :   if ((NonZeroByteMask & C) != NonZeroByteMask)
    6253             :     return 0; // Partial bytes selected.
    6254         318 :   return C;
    6255             : }
    6256             : 
    6257             : // Check if a node selects whole bytes from its operand 0 starting at a byte
    6258             : // boundary while masking the rest. Returns the select mask as used by
    6259             : // v_perm_b32, or -1 if it did not succeed.
    6260             : // Note byte select encoding:
    6261             : // value 0-3 selects corresponding source byte;
    6262             : // value 0xc selects zero;
    6263             : // value 0xff selects 0xff.
    6264        2438 : static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
    6265             :   assert(V.getValueSizeInBits() == 32);
    6266             : 
    6267        2438 :   if (V.getNumOperands() != 2)
    6268             :     return ~0;
    6269             : 
    6270             :   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
    6271             :   if (!N1)
    6272             :     return ~0;
    6273             : 
    6274        2328 :   uint32_t C = N1->getZExtValue();
    6275             : 
    6276        1164 :   switch (V.getOpcode()) {
    6277             :   default:
    6278             :     break;
    6279         323 :   case ISD::AND:
    6280         323 :     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
    6281         311 :       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    6282             :     }
    6283             :     break;
    6284             : 
    6285           2 :   case ISD::OR:
    6286           2 :     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
    6287           2 :       return (0x03020100 & ~ConstMask) | ConstMask;
    6288             :     }
    6289             :     break;
    6290             : 
    6291         741 :   case ISD::SHL:
    6292         741 :     if (C % 8)
    6293             :       return ~0;
    6294             : 
    6295         667 :     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    6296             : 
    6297          26 :   case ISD::SRL:
    6298          26 :     if (C % 8)
    6299             :       return ~0;
    6300             : 
    6301          26 :     return uint32_t(0x0c0c0c0c03020100ull >> C);
    6302             :   }
    6303             : 
    6304             :   return ~0;
    6305             : }
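
As a standalone restatement of the AND case above (permuteMaskForAnd is a
hypothetical helper written only to illustrate the selector encoding; it is not
part of this file):

  #include <cstdint>

  // For (and x, C) where every byte of C is 0x00 or 0xff, build the
  // v_perm_b32 selector: kept bytes select themselves (0-3), cleared bytes
  // select zero (0xc).
  static uint32_t permuteMaskForAnd(uint32_t C) {
    uint32_t Sel = 0;
    for (unsigned I = 0; I != 4; ++I) {
      uint32_t Byte = (C >> (8 * I)) & 0xff;
      if (Byte == 0xff)
        Sel |= I << (8 * I);           // pass source byte I through
      else if (Byte == 0)
        Sel |= 0x0cu << (8 * I);       // 0xc selects a zero byte
      else
        return ~0u;                    // partial byte: not representable
    }
    return Sel;
  }

  // e.g. C = 0x00ff00ff gives Sel = 0x0c020c00, matching
  // (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask) above.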
    6306             : 
    6307       30911 : SDValue SITargetLowering::performAndCombine(SDNode *N,
    6308             :                                             DAGCombinerInfo &DCI) const {
    6309       30911 :   if (DCI.isBeforeLegalize())
    6310         877 :     return SDValue();
    6311             : 
    6312       30034 :   SelectionDAG &DAG = DCI.DAG;
    6313       60068 :   EVT VT = N->getValueType(0);
    6314       30034 :   SDValue LHS = N->getOperand(0);
    6315       30034 :   SDValue RHS = N->getOperand(1);
    6316             : 
    6317             : 
    6318             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6319        1841 :   if (VT == MVT::i64 && CRHS) {
    6320        1603 :     if (SDValue Split
    6321        3206 :         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
    6322        1523 :       return Split;
    6323             :   }
    6324             : 
    6325       28511 :   if (CRHS && VT == MVT::i32) {
    6326             :     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    6327             :     // nb = number of trailing zeroes in mask
    6328             :     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    6329             :     // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
    6330       24367 :     uint64_t Mask = CRHS->getZExtValue();
    6331             :     unsigned Bits = countPopulation(Mask);
    6332       33579 :     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
    6333       26853 :         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
    6334          54 :       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
    6335         108 :         unsigned Shift = CShift->getZExtValue();
    6336         108 :         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
    6337          54 :         unsigned Offset = NB + Shift;
    6338          54 :         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
    6339             :           SDLoc SL(N);
    6340             :           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    6341          54 :                                     LHS->getOperand(0),
    6342             :                                     DAG.getConstant(Offset, SL, MVT::i32),
    6343         162 :                                     DAG.getConstant(Bits, SL, MVT::i32));
    6344          54 :           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
    6345             :           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
    6346          54 :                                     DAG.getValueType(NarrowVT));
    6347          54 :           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
    6348         162 :                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
    6349          54 :           return Shl;
    6350             :         }
    6351             :       }
    6352             :     }
    6353             : 
    6354             :     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    6355       37267 :     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
    6356             :         isa<ConstantSDNode>(LHS.getOperand(2))) {
    6357           2 :       uint32_t Sel = getConstantPermuteMask(Mask);
    6358           2 :       if (!Sel)
    6359           0 :         return SDValue();
    6360             : 
    6361             :       // Select 0xc for all zero bytes
    6362           2 :       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    6363             :       SDLoc DL(N);
    6364             :       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
    6365           4 :                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    6366             :     }
    6367             :   }
    6368             : 
    6369             :   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
    6370             :   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
    6371       28595 :   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    6372         140 :     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    6373         140 :     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
    6374             : 
    6375         140 :     SDValue X = LHS.getOperand(0);
    6376         140 :     SDValue Y = RHS.getOperand(0);
    6377         140 :     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
    6378         134 :       return SDValue();
    6379             : 
    6380           6 :     if (LCC == ISD::SETO) {
    6381             :       if (X != LHS.getOperand(1))
    6382           0 :         return SDValue();
    6383             : 
    6384           4 :       if (RCC == ISD::SETUNE) {
    6385             :         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
    6386           6 :         if (!C1 || !C1->isInfinity() || C1->isNegative())
    6387           0 :           return SDValue();
    6388             : 
    6389             :         const uint32_t Mask = SIInstrFlags::N_NORMAL |
    6390             :                               SIInstrFlags::N_SUBNORMAL |
    6391             :                               SIInstrFlags::N_ZERO |
    6392             :                               SIInstrFlags::P_ZERO |
    6393             :                               SIInstrFlags::P_SUBNORMAL |
    6394             :                               SIInstrFlags::P_NORMAL;
    6395             : 
    6396             :         static_assert(((~(SIInstrFlags::S_NAN |
    6397             :                           SIInstrFlags::Q_NAN |
    6398             :                           SIInstrFlags::N_INFINITY |
    6399             :                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
    6400             :                       "mask not equal");
    6401             : 
    6402             :         SDLoc DL(N);
    6403             :         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6404           4 :                            X, DAG.getConstant(Mask, DL, MVT::i32));
    6405             :       }
    6406             :     }
    6407             :   }
    6408             : 
    6409       24608 :   if (VT == MVT::i32 &&
    6410       24607 :       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    6411             :     // and x, (sext cc from i1) => select cc, x, 0
    6412          24 :     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
    6413             :       std::swap(LHS, RHS);
    6414          24 :     if (isBoolSGPR(RHS.getOperand(0)))
    6415          16 :       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
    6416          64 :                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
    6417             :   }
    6418             : 
    6419             :   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
    6420       28303 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6421       39826 :   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
    6422         549 :       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    6423         259 :     uint32_t LHSMask = getPermuteMask(DAG, LHS);
    6424         259 :     uint32_t RHSMask = getPermuteMask(DAG, RHS);
    6425         259 :     if (LHSMask != ~0u && RHSMask != ~0u) {
    6426             :       // Canonicalize the expression in an attempt to have fewer unique masks
    6427             :       // and therefore fewer registers used to hold the masks.
    6428           1 :       if (LHSMask > RHSMask) {
    6429             :         std::swap(LHSMask, RHSMask);
    6430             :         std::swap(LHS, RHS);
    6431             :       }
    6432             : 
    6433             :       // Select 0xc for each lane used from the source operand. Zero has the
    6434             :       // 0xc mask set, 0xff has 0xff in the mask, and actual lanes are 0-3.
    6435           1 :       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6436           1 :       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6437             : 
    6438             :       // Check if we need to combine values from two sources within a byte.
    6439           2 :       if (!(LHSUsedLanes & RHSUsedLanes) &&
    6440             :           // If we select the high and low words, keep it for SDWA.
    6441             :           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
    6442           1 :           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    6443             :         // Each byte in each mask is either a selector value 0-3, or has
    6444             :         // higher bits set: 0xff to select 0xff, or 0x0c to select zero. If
    6445             :         // 0x0c appears in either mask, the result byte must be 0x0c.
    6446             :         // Otherwise the mask which is not 0xff wins. Anding both masks gives
    6447             :         // the correct result, except that 0x0c bytes must be restored.
    6448           1 :         uint32_t Mask = LHSMask & RHSMask;
    6449           9 :         for (unsigned I = 0; I < 32; I += 8) {
    6450           4 :           uint32_t ByteSel = 0xff << I;
    6451           4 :           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
    6452           0 :             Mask &= (0x0c << I) & 0xffffffff;
    6453           0 :             Mask = (Mask & ~ByteSel) | (0x0c << I);
    6454             : 
    6455             :         // Add 4 to each active LHS lane. It will not affect any existing 0xff
    6456             :         // or 0x0c.
    6457           1 :         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
    6458             :         SDLoc DL(N);
    6459             : 
    6460             :         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
    6461             :                            LHS.getOperand(0), RHS.getOperand(0),
    6462           2 :                            DAG.getConstant(Sel, DL, MVT::i32));
    6463             :       }
    6464             :     }
    6465             :   }
    6466             : 
    6467       28302 :   return SDValue();
    6468             : }
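
A worked instance of the srl/bfe sub-combine near the top of this function
(values chosen only for illustration): for (and (srl x, 8), 0xff00) the mask
has 8 set bits and 8 trailing zeroes, so Offset = 8 + 8 = 16 is byte aligned
and the node becomes (shl (bfe_u32 x, 16, 8), 8), which the SDWA peephole can
later absorb on GFX8+.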
    6469             : 
    6470       16697 : SDValue SITargetLowering::performOrCombine(SDNode *N,
    6471             :                                            DAGCombinerInfo &DCI) const {
    6472       16697 :   SelectionDAG &DAG = DCI.DAG;
    6473       16697 :   SDValue LHS = N->getOperand(0);
    6474       16697 :   SDValue RHS = N->getOperand(1);
    6475             : 
    6476       16697 :   EVT VT = N->getValueType(0);
    6477             :   if (VT == MVT::i1) {
    6478             :     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    6479         127 :     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
    6480             :         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    6481          14 :       SDValue Src = LHS.getOperand(0);
    6482             :       if (Src != RHS.getOperand(0))
    6483           1 :         return SDValue();
    6484             : 
    6485             :       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    6486             :       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    6487          13 :       if (!CLHS || !CRHS)
    6488           0 :         return SDValue();
    6489             : 
    6490             :       // Only 10 bits are used.
    6491             :       static const uint32_t MaxMask = 0x3ff;
    6492             : 
    6493          39 :       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    6494             :       SDLoc DL(N);
    6495             :       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6496          26 :                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
    6497             :     }
    6498             : 
    6499          99 :     return SDValue();
    6500             :   }
    6501             : 
    6502             :   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    6503        5807 :   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
    6504             :       LHS.getOpcode() == AMDGPUISD::PERM &&
    6505             :       isa<ConstantSDNode>(LHS.getOperand(2))) {
    6506           3 :     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    6507           3 :     if (!Sel)
    6508           0 :       return SDValue();
    6509             : 
    6510           3 :     Sel |= LHS.getConstantOperandVal(2);
    6511             :     SDLoc DL(N);
    6512             :     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
    6513           6 :                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    6514             :   }
    6515             : 
    6516             :   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
    6517       16581 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6518       31797 :   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
    6519        1802 :       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    6520         960 :     uint32_t LHSMask = getPermuteMask(DAG, LHS);
    6521         960 :     uint32_t RHSMask = getPermuteMask(DAG, RHS);
    6522         960 :     if (LHSMask != ~0u && RHSMask != ~0u) {
    6523             :       // Canonicalize the expression in an attempt to have fewer unique masks
    6524             :       // and therefore fewer registers used to hold the masks.
    6525         118 :       if (LHSMask > RHSMask) {
    6526             :         std::swap(LHSMask, RHSMask);
    6527             :         std::swap(LHS, RHS);
    6528             :       }
    6529             : 
    6530             :       // Select 0xc for each lane used from the source operand. Zero has the
    6531             :       // 0xc mask set, 0xff has 0xff in the mask, and actual lanes are 0-3.
    6532         118 :       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6533         118 :       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6534             : 
    6535             :       // Check if we need to combine values from two sources within a byte.
    6536         236 :       if (!(LHSUsedLanes & RHSUsedLanes) &&
    6537             :           // If we select the high and low words, keep it for SDWA.
    6538             :           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
    6539         118 :           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    6540             :         // Kill zero bytes selected by other mask. Zero value is 0xc.
    6541             :         LHSMask &= ~RHSUsedLanes;
    6542          12 :         RHSMask &= ~LHSUsedLanes;
    6543             :         // Add 4 to each active LHS lane
    6544          12 :         LHSMask |= LHSUsedLanes & 0x04040404;
    6545             :         // Combine masks
    6546          12 :         uint32_t Sel = LHSMask | RHSMask;
    6547             :         SDLoc DL(N);
    6548             : 
    6549             :         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
    6550             :                            LHS.getOperand(0), RHS.getOperand(0),
    6551          24 :                            DAG.getConstant(Sel, DL, MVT::i32));
    6552             :       }
    6553             :     }
    6554             :   }
    6555             : 
    6556             :   if (VT != MVT::i64)
    6557       14798 :     return SDValue();
    6558             : 
    6559             :   // TODO: This could be a generic combine with a predicate for targets where
    6560             :   // extracting the high half of an integer is free.
    6561             : 
    6562             :   // (or i64:x, (zero_extend i32:y)) ->
    6563             :   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
    6564        2624 :   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
    6565             :       RHS.getOpcode() != ISD::ZERO_EXTEND)
    6566             :     std::swap(LHS, RHS);
    6567             : 
    6568        1771 :   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    6569         886 :     SDValue ExtSrc = RHS.getOperand(0);
    6570             :     EVT SrcVT = ExtSrc.getValueType();
    6571             :     if (SrcVT == MVT::i32) {
    6572             :       SDLoc SL(N);
    6573             :       SDValue LowLHS, HiBits;
    6574        1772 :       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
    6575         886 :       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
    6576             : 
    6577         886 :       DCI.AddToWorklist(LowOr.getNode());
    6578         886 :       DCI.AddToWorklist(HiBits.getNode());
    6579             : 
    6580             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    6581         886 :                                 LowOr, HiBits);
    6582         886 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    6583             :     }
    6584             :   }
    6585             : 
    6586         885 :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    6587             :   if (CRHS) {
    6588          83 :     if (SDValue Split
    6589         166 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
    6590          77 :       return Split;
    6591             :   }
    6592             : 
    6593         808 :   return SDValue();
    6594             : }
    6595             : 
    6596        1484 : SDValue SITargetLowering::performXorCombine(SDNode *N,
    6597             :                                             DAGCombinerInfo &DCI) const {
    6598        1484 :   EVT VT = N->getValueType(0);
    6599             :   if (VT != MVT::i64)
    6600         879 :     return SDValue();
    6601             : 
    6602         605 :   SDValue LHS = N->getOperand(0);
    6603         605 :   SDValue RHS = N->getOperand(1);
    6604             : 
    6605             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6606             :   if (CRHS) {
    6607         309 :     if (SDValue Split
    6608         618 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
    6609          26 :       return Split;
    6610             :   }
    6611             : 
    6612         579 :   return SDValue();
    6613             : }
    6614             : 
    6615             : // Instructions that will be lowered with a final instruction that zeros the
    6616             : // high result bits.
    6617             : // XXX - probably only need to list legal operations.
    6618         248 : static bool fp16SrcZerosHighBits(unsigned Opc) {
    6619         248 :   switch (Opc) {
    6620             :   case ISD::FADD:
    6621             :   case ISD::FSUB:
    6622             :   case ISD::FMUL:
    6623             :   case ISD::FDIV:
    6624             :   case ISD::FREM:
    6625             :   case ISD::FMA:
    6626             :   case ISD::FMAD:
    6627             :   case ISD::FCANONICALIZE:
    6628             :   case ISD::FP_ROUND:
    6629             :   case ISD::UINT_TO_FP:
    6630             :   case ISD::SINT_TO_FP:
    6631             :   case ISD::FABS:
    6632             :     // Fabs is lowered to a bit operation, but it's an and which will clear the
    6633             :     // high bits anyway.
    6634             :   case ISD::FSQRT:
    6635             :   case ISD::FSIN:
    6636             :   case ISD::FCOS:
    6637             :   case ISD::FPOWI:
    6638             :   case ISD::FPOW:
    6639             :   case ISD::FLOG:
    6640             :   case ISD::FLOG2:
    6641             :   case ISD::FLOG10:
    6642             :   case ISD::FEXP:
    6643             :   case ISD::FEXP2:
    6644             :   case ISD::FCEIL:
    6645             :   case ISD::FTRUNC:
    6646             :   case ISD::FRINT:
    6647             :   case ISD::FNEARBYINT:
    6648             :   case ISD::FROUND:
    6649             :   case ISD::FFLOOR:
    6650             :   case ISD::FMINNUM:
    6651             :   case ISD::FMAXNUM:
    6652             :   case AMDGPUISD::FRACT:
    6653             :   case AMDGPUISD::CLAMP:
    6654             :   case AMDGPUISD::COS_HW:
    6655             :   case AMDGPUISD::SIN_HW:
    6656             :   case AMDGPUISD::FMIN3:
    6657             :   case AMDGPUISD::FMAX3:
    6658             :   case AMDGPUISD::FMED3:
    6659             :   case AMDGPUISD::FMAD_FTZ:
    6660             :   case AMDGPUISD::RCP:
    6661             :   case AMDGPUISD::RSQ:
    6662             :   case AMDGPUISD::LDEXP:
    6663             :     return true;
    6664          35 :   default:
    6665             :     // fcopysign, select and others may be lowered to 32-bit bit operations
    6666             :     // which don't zero the high bits.
    6667          35 :     return false;
    6668             :   }
    6669             : }
    6670             : 
    6671       18492 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
    6672             :                                                    DAGCombinerInfo &DCI) const {
    6673       32460 :   if (!Subtarget->has16BitInsts() ||
    6674       13968 :       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6675       15086 :     return SDValue();
    6676             : 
    6677        6812 :   EVT VT = N->getValueType(0);
    6678             :   if (VT != MVT::i32)
    6679        1745 :     return SDValue();
    6680             : 
    6681        1661 :   SDValue Src = N->getOperand(0);
    6682             :   if (Src.getValueType() != MVT::i16)
    6683         269 :     return SDValue();
    6684             : 
    6685             :   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
    6686             :   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
    6687        1392 :   if (Src.getOpcode() == ISD::BITCAST) {
    6688         248 :     SDValue BCSrc = Src.getOperand(0);
    6689         248 :     if (BCSrc.getValueType() == MVT::f16 &&
    6690         248 :         fp16SrcZerosHighBits(BCSrc.getOpcode()))
    6691         639 :       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
    6692             :   }
    6693             : 
    6694        1179 :   return SDValue();
    6695             : }
    6696             : 
    6697          85 : SDValue SITargetLowering::performClassCombine(SDNode *N,
    6698             :                                               DAGCombinerInfo &DCI) const {
    6699          85 :   SelectionDAG &DAG = DCI.DAG;
    6700          85 :   SDValue Mask = N->getOperand(1);
    6701             : 
    6702             :   // fp_class x, 0 -> false
    6703             :   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    6704         126 :     if (CMask->isNullValue())
    6705           4 :       return DAG.getConstant(0, SDLoc(N), MVT::i1);
    6706             :   }
    6707             : 
    6708         166 :   if (N->getOperand(0).isUndef())
    6709           2 :     return DAG.getUNDEF(MVT::i1);
    6710             : 
    6711          81 :   return SDValue();
    6712             : }
    6713             : 
    6714             : static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
    6715          61 :   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
    6716             :     return true;
    6717             : 
    6718          29 :   return DAG.isKnownNeverNaN(Op);
    6719             : }
    6720             : 
    6721         438 : static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
    6722             :                             const SISubtarget *ST, unsigned MaxDepth=5) {
    6723             :   // If source is a result of another standard FP operation it is already in
    6724             :   // canonical form.
    6725             : 
    6726         438 :   switch (Op.getOpcode()) {
    6727             :   default:
    6728             :     break;
    6729             : 
    6730             :   // These will flush denorms if required.
    6731             :   case ISD::FADD:
    6732             :   case ISD::FSUB:
    6733             :   case ISD::FMUL:
    6734             :   case ISD::FSQRT:
    6735             :   case ISD::FCEIL:
    6736             :   case ISD::FFLOOR:
    6737             :   case ISD::FMA:
    6738             :   case ISD::FMAD:
    6739             : 
    6740             :   case ISD::FCANONICALIZE:
    6741             :     return true;
    6742             : 
    6743             :   case ISD::FP_ROUND:
    6744          36 :     return Op.getValueType().getScalarType() != MVT::f16 ||
    6745          16 :            ST->hasFP16Denormals();
    6746             : 
    6747             :   case ISD::FP_EXTEND:
    6748          20 :     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
    6749           4 :            ST->hasFP16Denormals();
    6750             : 
    6751           0 :   case ISD::FP16_TO_FP:
    6752             :   case ISD::FP_TO_FP16:
    6753           0 :     return ST->hasFP16Denormals();
    6754             : 
    6755             :   // These can/will be lowered or combined into bit operations, so their
    6756             :   // operands need to be checked recursively.
    6757          82 :   case ISD::FNEG:
    6758             :   case ISD::FABS:
    6759         164 :     return (MaxDepth > 0) &&
    6760         164 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
    6761             : 
    6762             :   case ISD::FSIN:
    6763             :   case ISD::FCOS:
    6764             :   case ISD::FSINCOS:
    6765          32 :     return Op.getValueType().getScalarType() != MVT::f16;
    6766             : 
    6767             :   // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
    6768             :   // such targets the operands need to be checked recursively.
    6769          44 :   case ISD::FMINNUM:
    6770             :   case ISD::FMAXNUM:
    6771             :   case ISD::FMINNAN:
    6772             :   case ISD::FMAXNAN:
    6773             : 
    6774          22 :     if (ST->supportsMinMaxDenormModes() &&
    6775          66 :         DAG.isKnownNeverNaN(Op.getOperand(0)) &&
    6776           0 :         DAG.isKnownNeverNaN(Op.getOperand(1)))
    6777             :       return true;
    6778             : 
    6779          44 :     return (MaxDepth > 0) &&
    6780         100 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
    6781          12 :            isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
    6782             : 
    6783             :   case ISD::ConstantFP: {
    6784          12 :     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
    6785          24 :     return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
    6786             :   }
    6787             :   }
    6788             :   return false;
    6789             : }
    6790             : 
    6791             : // Constant fold canonicalize.
    6792         412 : SDValue SITargetLowering::performFCanonicalizeCombine(
    6793             :   SDNode *N,
    6794             :   DAGCombinerInfo &DCI) const {
    6795         412 :   SelectionDAG &DAG = DCI.DAG;
    6796         824 :   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
    6797             : 
    6798         412 :   if (!CFP) {
    6799         312 :     SDValue N0 = N->getOperand(0);
    6800         312 :     EVT VT = N0.getValueType().getScalarType();
    6801         312 :     auto ST = getSubtarget();
    6802             : 
    6803         168 :     if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
    6804          34 :          (VT == MVT::f64 && ST->hasFP64Denormals()) ||
    6805         279 :          (VT == MVT::f16 && ST->hasFP16Denormals())) &&
    6806         169 :         DAG.isKnownNeverNaN(N0))
    6807          10 :       return N0;
    6808             : 
    6809         302 :     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
    6810             : 
    6811         605 :     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
    6812         300 :         isCanonicalized(DAG, N0, ST))
    6813          94 :       return N0;
    6814             : 
    6815         208 :     return SDValue();
    6816             :   }
    6817             : 
    6818         100 :   const APFloat &C = CFP->getValueAPF();
    6819             : 
    6820             :   // Flush denormals to 0 if not enabled.
    6821         100 :   if (C.isDenormal()) {
    6822          48 :     EVT VT = N->getValueType(0);
    6823          24 :     EVT SVT = VT.getScalarType();
    6824           4 :     if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
    6825           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6826             : 
    6827           4 :     if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
    6828           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6829             : 
    6830          16 :     if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
    6831           0 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6832             :   }
    6833             : 
    6834          96 :   if (C.isNaN()) {
    6835          84 :     EVT VT = N->getValueType(0);
    6836             :     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    6837          42 :     if (C.isSignaling()) {
    6838             :       // Quiet a signaling NaN.
    6839          44 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6840             :     }
    6841             : 
    6842             :     // Make sure it is the canonical NaN bit pattern.
    6843             :     //
    6844             :     // TODO: Can we use -1 as the canonical NaN value since it's an inline
    6845             :     // immediate?
    6846          60 :     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
    6847          28 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6848             :   }
    6849             : 
    6850          60 :   return N->getOperand(0);
    6851             : }
    6852             : 
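// Standalone sketch (not part of SIISelLowering.cpp), assuming the usual
// IEEE-754 single-precision encoding: APFloat::getQNaN for f32 corresponds to
// the 0x7fc00000 pattern, so any NaN constant whose bits differ (for example a
// signaling NaN with the quiet bit clear) is rewritten to that canonical quiet
// NaN by the constant fold above.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t SNaNBits = 0x7fa00000;      // exponent all ones, quiet bit (bit 22) clear
  const uint32_t CanonicalQNaN = 0x7fc00000; // quiet NaN with a zero payload
  float F;
  std::memcpy(&F, &SNaNBits, sizeof(F));
  assert(std::isnan(F));                     // still a NaN...
  assert(SNaNBits != CanonicalQNaN);         // ...but not the canonical pattern, so it gets replaced
  return 0;
}
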
    6853             : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
    6854          59 :   switch (Opc) {
    6855             :   case ISD::FMAXNUM:
    6856             :     return AMDGPUISD::FMAX3;
    6857           5 :   case ISD::SMAX:
    6858             :     return AMDGPUISD::SMAX3;
    6859           5 :   case ISD::UMAX:
    6860             :     return AMDGPUISD::UMAX3;
    6861          12 :   case ISD::FMINNUM:
    6862             :     return AMDGPUISD::FMIN3;
    6863          15 :   case ISD::SMIN:
    6864             :     return AMDGPUISD::SMIN3;
    6865           8 :   case ISD::UMIN:
    6866             :     return AMDGPUISD::UMIN3;
    6867           0 :   default:
    6868           0 :     llvm_unreachable("Not a min/max opcode");
    6869             :   }
    6870             : }
    6871             : 
    6872         150 : SDValue SITargetLowering::performIntMed3ImmCombine(
    6873             :   SelectionDAG &DAG, const SDLoc &SL,
    6874             :   SDValue Op0, SDValue Op1, bool Signed) const {
    6875             :   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
    6876             :   if (!K1)
    6877          90 :     return SDValue();
    6878             : 
    6879             :   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    6880             :   if (!K0)
    6881           3 :     return SDValue();
    6882             : 
    6883          57 :   if (Signed) {
    6884         144 :     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
    6885           3 :       return SDValue();
    6886             :   } else {
    6887          27 :     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
    6888           3 :       return SDValue();
    6889             :   }
    6890             : 
    6891         102 :   EVT VT = K0->getValueType(0);
    6892          51 :   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
    6893           8 :   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    6894             :     return DAG.getNode(Med3Opc, SL, VT,
    6895          49 :                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
    6896             :   }
    6897             : 
    6898             :   // If there isn't a 16-bit med3 operation, convert to 32-bit.
    6899             :   MVT NVT = MVT::i32;
    6900           2 :   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    6901             : 
    6902           2 :   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
    6903           4 :   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
    6904           2 :   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
    6905             : 
    6906           2 :   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
    6907           2 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
    6908             : }
    6909             : 
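// Standalone sketch (not part of SIISelLowering.cpp): the smin(smax(x, K0), K1)
// pattern matched above is a clamp, and for K0 < K1 it equals the median of
// {x, K0, K1}, which is exactly what a med3 operation computes.
#include <algorithm>
#include <cassert>

static int med3(int A, int B, int C) {
  // Median of three values.
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const int K0 = -100, K1 = 100;             // requires K0 < K1, as checked above
  for (int X : {-500, -100, -1, 0, 42, 100, 500})
    assert(std::min(std::max(X, K0), K1) == med3(X, K0, K1));
  return 0;
}
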
    6910         778 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
    6911             :   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    6912             :     return C;
    6913             : 
    6914             :   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    6915          43 :     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
    6916             :       return C;
    6917             :   }
    6918             : 
    6919             :   return nullptr;
    6920             : }
    6921             : 
    6922         455 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
    6923             :                                                   const SDLoc &SL,
    6924             :                                                   SDValue Op0,
    6925             :                                                   SDValue Op1) const {
    6926         455 :   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
    6927         455 :   if (!K1)
    6928         132 :     return SDValue();
    6929             : 
    6930         323 :   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
    6931         323 :   if (!K0)
    6932           3 :     return SDValue();
    6933             : 
    6934             :   // Ordered >= (although NaN inputs should have folded away by now).
    6935         960 :   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
    6936         320 :   if (Cmp == APFloat::cmpGreaterThan)
    6937           8 :     return SDValue();
    6938             : 
    6939             :   // TODO: Check IEEE bit enabled?
    6940         624 :   EVT VT = Op0.getValueType();
    6941         312 :   if (Subtarget->enableDX10Clamp()) {
    6942             :     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    6943             :     // hardware fmed3 behavior converting to a min.
    6944             :     // FIXME: Should this be allowing -0.0?
    6945         859 :     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
    6946         250 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
    6947             :   }
    6948             : 
    6949             :   // med3 for f16 is only available on gfx9+, and not available for v2f16.
    6950          10 :   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    6951             :     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    6952             :     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    6953             :     // then give the other result, which is different from med3 with a NaN
    6954             :     // input.
    6955          53 :     SDValue Var = Op0.getOperand(0);
    6956          26 :     if (!isKnownNeverSNan(DAG, Var))
    6957          15 :       return SDValue();
    6958             : 
    6959             :     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
    6960          76 :                        Var, SDValue(K0, 0), SDValue(K1, 0));
    6961             :   }
    6962             : 
    6963           9 :   return SDValue();
    6964             : }
    6965             : 
    6966        3335 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    6967             :                                                DAGCombinerInfo &DCI) const {
    6968        3335 :   SelectionDAG &DAG = DCI.DAG;
    6969             : 
    6970        6670 :   EVT VT = N->getValueType(0);
    6971        3335 :   unsigned Opc = N->getOpcode();
    6972        3335 :   SDValue Op0 = N->getOperand(0);
    6973        3335 :   SDValue Op1 = N->getOperand(1);
    6974             : 
    6975             :   // Only do this if the inner op has one use since this will just increase
    6976             :   // register pressure for no benefit.
    6977             : 
    6978             : 
    6979        6620 :   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
    6980        3335 :       !VT.isVector() && VT != MVT::f64 &&
    6981         710 :       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
    6982             :     // max(max(a, b), c) -> max3(a, b, c)
    6983             :     // min(min(a, b), c) -> min3(a, b, c)
    6984        2789 :     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
    6985             :       SDLoc DL(N);
    6986             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    6987             :                          DL,
    6988             :                          N->getValueType(0),
    6989             :                          Op0.getOperand(0),
    6990             :                          Op0.getOperand(1),
    6991          78 :                          Op1);
    6992             :     }
    6993             : 
    6994             :     // Try commuted.
    6995             :     // max(a, max(b, c)) -> max3(a, b, c)
    6996             :     // min(a, min(b, c)) -> min3(a, b, c)
    6997        2725 :     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
    6998             :       SDLoc DL(N);
    6999             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    7000             :                          DL,
    7001             :                          N->getValueType(0),
    7002             :                          Op0,
    7003             :                          Op1.getOperand(0),
    7004          40 :                          Op1.getOperand(1));
    7005             :     }
    7006             :   }
    7007             : 
    7008             :   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
    7009        3880 :   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    7010         180 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
    7011          45 :       return Med3;
    7012             :   }
    7013             : 
    7014        3661 :   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    7015         120 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
    7016           6 :       return Med3;
    7017             :   }
    7018             : 
    7019             :   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
    7020         988 :   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
    7021          21 :        (Opc == AMDGPUISD::FMIN_LEGACY &&
    7022             :         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
    7023             :       (VT == MVT::f32 || VT == MVT::f64 ||
    7024          72 :        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
    7025        3717 :        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    7026             :       Op0.hasOneUse()) {
    7027         910 :     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
    7028         288 :       return Res;
    7029             :   }
    7030             : 
    7031        2937 :   return SDValue();
    7032             : }
    7033             : 
    7034         160 : static bool isClampZeroToOne(SDValue A, SDValue B) {
    7035             :   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    7036             :     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
    7037             :       // FIXME: Should this be allowing -0.0?
    7038         259 :       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
    7039          74 :              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    7040             :     }
    7041             :   }
    7042             : 
    7043             :   return false;
    7044             : }
    7045             : 
    7046             : // FIXME: Should only worry about sNaNs for the version with a chain.
    7047         107 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
    7048             :                                               DAGCombinerInfo &DCI) const {
    7049         214 :   EVT VT = N->getValueType(0);
    7050             :   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
    7051             :   // NaNs. With a NaN input, the order of the operands may change the result.
    7052             : 
    7053         107 :   SelectionDAG &DAG = DCI.DAG;
    7054             :   SDLoc SL(N);
    7055             : 
    7056         107 :   SDValue Src0 = N->getOperand(0);
    7057         107 :   SDValue Src1 = N->getOperand(1);
    7058         107 :   SDValue Src2 = N->getOperand(2);
    7059             : 
    7060         107 :   if (isClampZeroToOne(Src0, Src1)) {
    7061             :     // const_a, const_b, x -> clamp is safe in all cases including signaling
    7062             :     // nans.
    7063             :     // FIXME: Should this be allowing -0.0?
    7064          36 :     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
    7065             :   }
    7066             : 
    7067             :   // FIXME: dx10_clamp behavior is assumed in instcombine. Should we really
    7068             :   // bother handling the case where dx10-clamp is disabled?
    7069          71 :   if (Subtarget->enableDX10Clamp()) {
    7070             :     // If NaNs are clamped to 0, we are free to reorder the inputs.
    7071             : 
    7072             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    7073             :       std::swap(Src0, Src1);
    7074             : 
    7075             :     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
    7076             :       std::swap(Src1, Src2);
    7077             : 
    7078             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    7079             :       std::swap(Src0, Src1);
    7080             : 
    7081          53 :     if (isClampZeroToOne(Src1, Src2))
    7082          12 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
    7083             :   }
    7084             : 
    7085          59 :   return SDValue();
    7086             : }
    7087             : 
    7088         139 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
    7089             :                                                  DAGCombinerInfo &DCI) const {
    7090         139 :   SDValue Src0 = N->getOperand(0);
    7091         139 :   SDValue Src1 = N->getOperand(1);
    7092         156 :   if (Src0.isUndef() && Src1.isUndef())
    7093           6 :     return DCI.DAG.getUNDEF(N->getValueType(0));
    7094         136 :   return SDValue();
    7095             : }
    7096             : 
    7097      143834 : SDValue SITargetLowering::performExtractVectorEltCombine(
    7098             :   SDNode *N, DAGCombinerInfo &DCI) const {
    7099      143834 :   SDValue Vec = N->getOperand(0);
    7100      143834 :   SelectionDAG &DAG = DCI.DAG;
    7101             : 
    7102      143834 :   EVT VecVT = Vec.getValueType();
    7103      143834 :   EVT EltVT = VecVT.getVectorElementType();
    7104             : 
    7105      143762 :   if ((Vec.getOpcode() == ISD::FNEG ||
    7106      143944 :        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    7107             :     SDLoc SL(N);
    7108         156 :     EVT EltVT = N->getValueType(0);
    7109          78 :     SDValue Idx = N->getOperand(1);
    7110             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7111          78 :                               Vec.getOperand(0), Idx);
    7112          78 :     return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
    7113             :   }
    7114             : 
    7115             :   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
    7116             :   //    =>
    7117             :   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
    7118             :   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
    7119             :   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
    7120      154060 :   if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    7121             :     SDLoc SL(N);
    7122        1714 :     EVT EltVT = N->getValueType(0);
    7123         857 :     SDValue Idx = N->getOperand(1);
    7124             :     unsigned Opc = Vec.getOpcode();
    7125             : 
    7126         857 :     switch(Opc) {
    7127         828 :     default:
    7128         828 :       return SDValue();
    7129             :       // TODO: Support other binary operations.
    7130             :     case ISD::FADD:
    7131             :     case ISD::ADD:
    7132             :     case ISD::UMIN:
    7133             :     case ISD::UMAX:
    7134             :     case ISD::SMIN:
    7135             :     case ISD::SMAX:
    7136             :     case ISD::FMAXNUM:
    7137             :     case ISD::FMINNUM:
    7138             :       return DAG.getNode(Opc, SL, EltVT,
    7139             :                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7140             :                                      Vec.getOperand(0), Idx),
    7141             :                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7142          58 :                                      Vec.getOperand(1), Idx));
    7143             :     }
    7144             :   }
    7145             : 
    7146      142899 :   if (!DCI.isBeforeLegalize())
    7147      133142 :     return SDValue();
    7148             : 
    7149        9757 :   unsigned VecSize = VecVT.getSizeInBits();
    7150        9757 :   unsigned EltSize = EltVT.getSizeInBits();
    7151             : 
    7152             :   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
    7153             :   // elements. This exposes more load reduction opportunities by replacing
    7154             :   // multiple small extract_vector_elements with a single 32-bit extract.
    7155        9757 :   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
    7156         565 :   if (EltSize <= 16 &&
    7157         557 :       EltVT.isByteSized() &&
    7158         390 :       VecSize > 32 &&
    7159       10471 :       VecSize % 32 == 0 &&
    7160             :       Idx) {
    7161         324 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
    7162             : 
    7163         648 :     unsigned BitIndex = Idx->getZExtValue() * EltSize;
    7164         324 :     unsigned EltIdx = BitIndex / 32;
    7165         324 :     unsigned LeftoverBitIdx = BitIndex % 32;
    7166             :     SDLoc SL(N);
    7167             : 
    7168         324 :     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    7169         324 :     DCI.AddToWorklist(Cast.getNode());
    7170             : 
    7171             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
    7172         648 :                               DAG.getConstant(EltIdx, SL, MVT::i32));
    7173         324 :     DCI.AddToWorklist(Elt.getNode());
    7174             :     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
    7175         648 :                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    7176         324 :     DCI.AddToWorklist(Srl.getNode());
    7177             : 
    7178         324 :     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    7179         324 :     DCI.AddToWorklist(Trunc.getNode());
    7180         324 :     return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
    7181             :   }
    7182             : 
    7183        9433 :   return SDValue();
    7184             : }
    7185             : 
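// Standalone sketch (not part of SIISelLowering.cpp), assuming little-endian
// layout as on AMDGPU: the sub-dword combine above extracts a small element by
// bitcasting the vector to 32-bit words, extracting the containing word, then
// shifting and truncating; the same computation is mirrored here with scalars.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t Vec[8] = {10, 11, 12, 13, 14, 15, 16, 17};   // a v8i8-style vector
  uint32_t Words[2];
  std::memcpy(Words, Vec, sizeof(Words));              // "bitcast" v8i8 -> v2i32

  for (unsigned Idx = 0; Idx < 8; ++Idx) {
    unsigned BitIndex = Idx * 8;                       // EltSize == 8 here
    uint32_t Word = Words[BitIndex / 32];              // extract of the 32-bit element
    uint8_t Elt = uint8_t(Word >> (BitIndex % 32));    // SRL + TRUNCATE
    assert(Elt == Vec[Idx]);
  }
  return 0;
}
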
    7186        3308 : static bool convertBuildVectorCastElt(SelectionDAG &DAG,
    7187             :                                       SDValue &Lo, SDValue &Hi) {
    7188        3308 :   if (Hi.getOpcode() == ISD::BITCAST &&
    7189        3314 :       Hi.getOperand(0).getValueType() == MVT::f16 &&
    7190           4 :       (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
    7191           4 :     Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
    7192           4 :     Hi = Hi.getOperand(0);
    7193           2 :     return true;
    7194             :   }
    7195             : 
    7196             :   return false;
    7197             : }
    7198             : 
    7199      112989 : SDValue SITargetLowering::performBuildVectorCombine(
    7200             :   SDNode *N, DAGCombinerInfo &DCI) const {
    7201             :   SDLoc SL(N);
    7202             : 
    7203             :   if (!isTypeLegal(MVT::v2i16))
    7204       50949 :     return SDValue();
    7205       62040 :   SelectionDAG &DAG = DCI.DAG;
    7206      124080 :   EVT VT = N->getValueType(0);
    7207             : 
    7208             :   if (VT == MVT::v2i16) {
    7209        1655 :     SDValue Lo = N->getOperand(0);
    7210        1655 :     SDValue Hi = N->getOperand(1);
    7211             : 
    7212             :     // v2i16 build_vector (const|undef), (bitcast f16:$x)
    7213             :     // -> bitcast (v2f16 build_vector (const|undef), $x)
    7214        1655 :     if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
    7215           4 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
    7216           2 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    7217             :     }
    7218             : 
    7219        1653 :     if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
    7220           0 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
    7221           0 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    7222             :     }
    7223             :   }
    7224             : 
    7225       62038 :   return SDValue();
    7226             : }
    7227             : 
    7228         200 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
    7229             :                                           const SDNode *N0,
    7230             :                                           const SDNode *N1) const {
    7231         400 :   EVT VT = N0->getValueType(0);
    7232             : 
    7233             :   // Only do this if we are not trying to support denormals. v_mad_f32 does not
    7234             :   // support denormals ever.
    7235         122 :   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
    7236          32 :       (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
    7237             :     return ISD::FMAD;
    7238             : 
    7239          86 :   const TargetOptions &Options = DAG.getTarget().Options;
    7240         176 :   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
    7241          51 :        (N0->getFlags().hasAllowContract() &&
    7242         136 :         N1->getFlags().hasAllowContract())) &&
    7243          44 :       isFMAFasterThanFMulAndFAdd(VT)) {
    7244             :     return ISD::FMA;
    7245             :   }
    7246             : 
    7247             :   return 0;
    7248             : }
    7249             : 
    7250          20 : static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
    7251             :                            EVT VT,
    7252             :                            SDValue N0, SDValue N1, SDValue N2,
    7253             :                            bool Signed) {
    7254          20 :   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
    7255          20 :   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
    7256          20 :   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
    7257          20 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
    7258             : }
    7259             : 
    7260      134808 : SDValue SITargetLowering::performAddCombine(SDNode *N,
    7261             :                                             DAGCombinerInfo &DCI) const {
    7262      134808 :   SelectionDAG &DAG = DCI.DAG;
    7263      269616 :   EVT VT = N->getValueType(0);
    7264             :   SDLoc SL(N);
    7265      134808 :   SDValue LHS = N->getOperand(0);
    7266      134808 :   SDValue RHS = N->getOperand(1);
    7267             : 
    7268      134553 :   if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
    7269        1731 :       && Subtarget->hasMad64_32() &&
    7270      135224 :       !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
    7271             :       VT.getScalarSizeInBits() <= 64) {
    7272          24 :     if (LHS.getOpcode() != ISD::MUL)
    7273             :       std::swap(LHS, RHS);
    7274             : 
    7275          24 :     SDValue MulLHS = LHS.getOperand(0);
    7276          24 :     SDValue MulRHS = LHS.getOperand(1);
    7277          24 :     SDValue AddRHS = RHS;
    7278             : 
    7279             :     // TODO: Maybe restrict if SGPR inputs.
    7280          38 :     if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
    7281          14 :         numBitsUnsigned(MulRHS, DAG) <= 32) {
    7282          13 :       MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
    7283          13 :       MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
    7284          13 :       AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
    7285          13 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    7286             :     }
    7287             : 
    7288          18 :     if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
    7289           7 :       MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
    7290           7 :       MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
    7291           7 :       AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
    7292           7 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    7293             :     }
    7294             : 
    7295           4 :     return SDValue();
    7296             :   }
    7297             : 
    7298      156833 :   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    7299      121867 :     return SDValue();
    7300             : 
    7301             :   // add x, zext (setcc) => addcarry x, 0, setcc
    7302             :   // add x, sext (setcc) => subcarry x, 0, setcc
    7303             :   unsigned Opc = LHS.getOpcode();
    7304       25834 :   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
    7305       12917 :       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    7306             :     std::swap(RHS, LHS);
    7307             : 
    7308             :   Opc = RHS.getOpcode();
    7309       12917 :   switch (Opc) {
    7310             :   default: break;
    7311          78 :   case ISD::ZERO_EXTEND:
    7312             :   case ISD::SIGN_EXTEND:
    7313             :   case ISD::ANY_EXTEND: {
    7314          78 :     auto Cond = RHS.getOperand(0);
    7315             :     if (!isBoolSGPR(Cond))
    7316             :       break;
    7317          20 :     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    7318          40 :     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    7319          20 :     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    7320          20 :     return DAG.getNode(Opc, SL, VTList, Args);
    7321             :   }
    7322           0 :   case ISD::ADDCARRY: {
    7323             :     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    7324             :     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    7325           0 :     if (!C || C->getZExtValue() != 0) break;
    7326           0 :     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    7327           0 :     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
    7328             :   }
    7329             :   }
    7330       12897 :   return SDValue();
    7331             : }
    7332             : 
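// Standalone sketch (not part of SIISelLowering.cpp): the MAD_U64_U32 path
// above is sound because, when both multiplicands are known to fit in 32 bits,
// truncating them and doing a 32x32->64 multiply plus the 64-bit addend gives
// the same value as the original i64 multiply feeding the i64 add.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t A64 = 0x89abcdefULL, B64 = 0x12345678ULL;   // i64 values that fit in 32 bits
  uint64_t C = 0x0011223344556677ULL;
  uint64_t WideMulAdd = A64 * B64 + C;                 // original i64 mul + add
  uint32_t A32 = (uint32_t)A64, B32 = (uint32_t)B64;   // the zext-or-trunc to i32 above
  uint64_t Mad = (uint64_t)A32 * B32 + C;              // what a 32x32->64 multiply-add computes
  assert(WideMulAdd == Mad);
  return 0;
}
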
    7333        2503 : SDValue SITargetLowering::performSubCombine(SDNode *N,
    7334             :                                             DAGCombinerInfo &DCI) const {
    7335        2503 :   SelectionDAG &DAG = DCI.DAG;
    7336        2503 :   EVT VT = N->getValueType(0);
    7337             : 
    7338             :   if (VT != MVT::i32)
    7339         613 :     return SDValue();
    7340             : 
    7341             :   SDLoc SL(N);
    7342        1890 :   SDValue LHS = N->getOperand(0);
    7343        1890 :   SDValue RHS = N->getOperand(1);
    7344             : 
    7345             :   unsigned Opc = LHS.getOpcode();
    7346        1890 :   if (Opc != ISD::SUBCARRY)
    7347             :     std::swap(RHS, LHS);
    7348             : 
    7349        1890 :   if (LHS.getOpcode() == ISD::SUBCARRY) {
    7350             :     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    7351             :     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    7352           4 :     if (!C || C->getZExtValue() != 0)
    7353           0 :       return SDValue();
    7354           2 :     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    7355           6 :     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
    7356             :   }
    7357        1888 :   return SDValue();
    7358             : }
    7359             : 
    7360         654 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
    7361             :   DAGCombinerInfo &DCI) const {
    7362             : 
    7363         654 :   if (N->getValueType(0) != MVT::i32)
    7364           0 :     return SDValue();
    7365             : 
    7366         654 :   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    7367         472 :   if (!C || C->getZExtValue() != 0)
    7368         418 :     return SDValue();
    7369             : 
    7370         236 :   SelectionDAG &DAG = DCI.DAG;
    7371         236 :   SDValue LHS = N->getOperand(0);
    7372             : 
    7373             :   // addcarry (add x, y), 0, cc => addcarry x, y, cc
    7374             :   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
    7375             :   unsigned LHSOpc = LHS.getOpcode();
    7376         236 :   unsigned Opc = N->getOpcode();
    7377         472 :   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
    7378         236 :       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    7379           2 :     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    7380           6 :     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
    7381             :   }
    7382         234 :   return SDValue();
    7383             : }
    7384             : 
    7385        7783 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
    7386             :                                              DAGCombinerInfo &DCI) const {
    7387        7783 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    7388        5552 :     return SDValue();
    7389             : 
    7390        2231 :   SelectionDAG &DAG = DCI.DAG;
    7391        4462 :   EVT VT = N->getValueType(0);
    7392             : 
    7393             :   SDLoc SL(N);
    7394        2231 :   SDValue LHS = N->getOperand(0);
    7395        2231 :   SDValue RHS = N->getOperand(1);
    7396             : 
    7397             :   // These should really be instruction patterns, but writing patterns with
    7398             :   // source modifiers is a pain.
    7399             : 
    7400             :   // fadd (fadd (a, a), b) -> mad 2.0, a, b
    7401        2231 :   if (LHS.getOpcode() == ISD::FADD) {
    7402         302 :     SDValue A = LHS.getOperand(0);
    7403             :     if (A == LHS.getOperand(1)) {
    7404         105 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    7405         105 :       if (FusedOp != 0) {
    7406          73 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7407          73 :         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    7408             :       }
    7409             :     }
    7410             :   }
    7411             : 
    7412             :   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    7413        2158 :   if (RHS.getOpcode() == ISD::FADD) {
    7414         130 :     SDValue A = RHS.getOperand(0);
    7415             :     if (A == RHS.getOperand(1)) {
    7416          30 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    7417          30 :       if (FusedOp != 0) {
    7418          20 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7419          20 :         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
    7420             :       }
    7421             :     }
    7422             :   }
    7423             : 
    7424        2138 :   return SDValue();
    7425             : }
    7426             : 
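// Standalone sketch (not part of SIISelLowering.cpp): the fadd (fadd a, a), b
// rewrite above uses the fact that a + a is exactly 2*a in binary floating
// point, so the whole expression matches a single contracted/fused 2.0*a + b.
#include <cassert>
#include <cmath>

int main() {
  float A = 1.5f, B = 0.25f;
  float TwoFAdd = (A + A) + B;              // fadd (fadd a, a), b
  float Fused = std::fma(2.0f, A, B);       // mad/fma 2.0, a, b
  assert(TwoFAdd == Fused);                 // a + a is exact, so the results agree here
  return 0;
}
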
    7427        1715 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
    7428             :                                              DAGCombinerInfo &DCI) const {
    7429        1715 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    7430        1194 :     return SDValue();
    7431             : 
    7432         521 :   SelectionDAG &DAG = DCI.DAG;
    7433             :   SDLoc SL(N);
    7434        1042 :   EVT VT = N->getValueType(0);
    7435             :   assert(!VT.isVector());
    7436             : 
    7437             :   // Try to get the fneg to fold into the source modifier. This undoes generic
    7438             :   // DAG combines and folds them into the mad.
    7439             :   //
    7440             :   // Only do this if we are not trying to support denormals. v_mad_f32 does
    7441             :   // not support denormals ever.
    7442         521 :   SDValue LHS = N->getOperand(0);
    7443         521 :   SDValue RHS = N->getOperand(1);
    7444         521 :   if (LHS.getOpcode() == ISD::FADD) {
    7445             :     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    7446          46 :     SDValue A = LHS.getOperand(0);
    7447             :     if (A == LHS.getOperand(1)) {
    7448          24 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    7449          24 :       if (FusedOp != 0){
    7450          17 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7451          17 :         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    7452             : 
    7453          17 :         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    7454             :       }
    7455             :     }
    7456             :   }
    7457             : 
    7458         504 :   if (RHS.getOpcode() == ISD::FADD) {
    7459             :     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
    7460             : 
    7461          50 :     SDValue A = RHS.getOperand(0);
    7462             :     if (A == RHS.getOperand(1)) {
    7463          41 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    7464          41 :       if (FusedOp != 0){
    7465          32 :         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
    7466          32 :         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
    7467             :       }
    7468             :     }
    7469             :   }
    7470             : 
    7471         472 :   return SDValue();
    7472             : }
    7473             : 
    7474        9811 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
    7475             :                                               DAGCombinerInfo &DCI) const {
    7476        9811 :   SelectionDAG &DAG = DCI.DAG;
    7477             :   SDLoc SL(N);
    7478             : 
    7479        9811 :   SDValue LHS = N->getOperand(0);
    7480        9811 :   SDValue RHS = N->getOperand(1);
    7481             :   EVT VT = LHS.getValueType();
    7482        9811 :   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
    7483             : 
    7484             :   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    7485             :   if (!CRHS) {
    7486             :     CRHS = dyn_cast<ConstantSDNode>(LHS);
    7487             :     if (CRHS) {
    7488             :       std::swap(LHS, RHS);
    7489           0 :       CC = getSetCCSwappedOperands(CC);
    7490             :     }
    7491             :   }
    7492             : 
    7493        9811 :   if (CRHS) {
    7494        4162 :     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
    7495          16 :         isBoolSGPR(LHS.getOperand(0))) {
    7496             :       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
    7497             :       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
    7498             :       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
    7499             :       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
    7500           6 :       if ((CRHS->isAllOnesValue() &&
    7501           3 :            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
    7502           0 :           (CRHS->isNullValue() &&
    7503           0 :            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
    7504             :         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    7505           6 :                            DAG.getConstant(-1, SL, MVT::i1));
    7506           0 :       if ((CRHS->isAllOnesValue() &&
    7507           0 :            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
    7508           0 :           (CRHS->isNullValue() &&
    7509           0 :            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
    7510           0 :         return LHS.getOperand(0);
    7511             :     }
    7512             : 
    7513        5315 :     uint64_t CRHSVal = CRHS->getZExtValue();
    7514        9051 :     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
    7515             :         LHS.getOpcode() == ISD::SELECT &&
    7516             :         isa<ConstantSDNode>(LHS.getOperand(1)) &&
    7517         148 :         isa<ConstantSDNode>(LHS.getOperand(2)) &&
    7518        5315 :         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    7519         148 :         isBoolSGPR(LHS.getOperand(0))) {
    7520             :       // Given CT != CF:
    7521             :       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
    7522             :       // setcc (select cc, CT, CF), CF, ne => cc
    7523             :       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
    7524             :       // setcc (select cc, CT, CF), CT, eq => cc
    7525             :       uint64_t CT = LHS.getConstantOperandVal(1);
    7526             :       uint64_t CF = LHS.getConstantOperandVal(2);
    7527             : 
    7528         153 :       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
    7529           5 :           (CT == CRHSVal && CC == ISD::SETNE))
    7530             :         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    7531         288 :                            DAG.getConstant(-1, SL, MVT::i1));
    7532           7 :       if ((CF == CRHSVal && CC == ISD::SETNE) ||
    7533           3 :           (CT == CRHSVal && CC == ISD::SETEQ))
    7534           2 :         return LHS.getOperand(0);
    7535             :     }
    7536             :   }
    7537             : 
    7538        7916 :   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
    7539             :                                            VT != MVT::f16))
    7540        3120 :     return SDValue();
    7541             : 
    7542             :   // Match isinf pattern
    7543             :   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
    7544        6695 :   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    7545             :     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    7546             :     if (!CRHS)
    7547           0 :       return SDValue();
    7548             : 
    7549           2 :     const APFloat &APF = CRHS->getValueAPF();
    7550           4 :     if (APF.isInfinity() && !APF.isNegative()) {
    7551             :       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
    7552             :       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
    7553           4 :                          DAG.getConstant(Mask, SL, MVT::i32));
    7554             :     }
    7555             :   }
    7556             : 
    7557        6540 :   return SDValue();
    7558             : }
    7559             : 
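// Standalone sketch (not part of SIISelLowering.cpp): for the sext-from-i1
// folds above, the sign-extended value is -1 when the i1 is true and 0 when it
// is false, so comparisons against -1 or 0 collapse to the i1 itself or its
// negation (the xor cc, -1 form).
#include <cassert>

int main() {
  for (bool CC : {false, true}) {
    int Ext = CC ? -1 : 0;              // sext i1 -> i32
    assert((Ext == -1) == CC);          // setcc eq -1  -> cc
    assert((Ext != -1) == !CC);         // setcc ne -1  -> xor cc, -1
    assert((Ext == 0) == !CC);          // setcc eq  0  -> xor cc, -1
    assert((Ext != 0) == CC);           // setcc ne  0  -> cc
  }
  return 0;
}
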
    7560         363 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
    7561             :                                                      DAGCombinerInfo &DCI) const {
    7562         363 :   SelectionDAG &DAG = DCI.DAG;
    7563             :   SDLoc SL(N);
    7564         726 :   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
    7565             : 
    7566         363 :   SDValue Src = N->getOperand(0);
    7567         363 :   SDValue Srl = N->getOperand(0);
    7568         363 :   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    7569          52 :     Srl = Srl.getOperand(0);
    7570             : 
    7571             :   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    7572         363 :   if (Srl.getOpcode() == ISD::SRL) {
    7573             :     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    7574             :     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    7575             :     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
    7576             : 
    7577             :     if (const ConstantSDNode *C =
    7578             :         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
    7579         118 :       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
    7580          59 :                                EVT(MVT::i32));
    7581             : 
    7582         118 :       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
    7583          59 :       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
    7584          59 :         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
    7585          59 :                            MVT::f32, Srl);
    7586             :       }
    7587             :     }
    7588             :   }
    7589             : 
    7590         304 :   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
    7591             : 
    7592         304 :   KnownBits Known;
    7593         304 :   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    7594         608 :                                         !DCI.isBeforeLegalizeOps());
    7595         304 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    7596         608 :   if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
    7597         304 :       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    7598          96 :     DCI.CommitTargetLoweringOpt(TLO);
    7599             :   }
    7600             : 
    7601         304 :   return SDValue();
    7602             : }
    7603             : 
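// Standalone sketch (not part of SIISelLowering.cpp): CVT_F32_UBYTEn converts
// byte n of a 32-bit source to float, so shifting the source right by a
// multiple of 8 is the same as selecting a higher byte, which is the folding
// done above (e.g. cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x).
#include <cassert>
#include <cstdint>

// Scalar model of CVT_F32_UBYTEn.
static float cvtF32UByte(uint32_t X, unsigned N) {
  return float((X >> (8 * N)) & 0xff);
}

int main() {
  uint32_t X = 0x44332211;
  assert(cvtF32UByte(X >> 8, 0) == cvtF32UByte(X, 1));
  assert(cvtF32UByte(X >> 16, 0) == cvtF32UByte(X, 2));
  assert(cvtF32UByte(X >> 16, 1) == cvtF32UByte(X, 3));
  return 0;
}
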
    7604         327 : SDValue SITargetLowering::performClampCombine(SDNode *N,
    7605             :                                               DAGCombinerInfo &DCI) const {
    7606         327 :   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
    7607             :   if (!CSrc)
    7608         302 :     return SDValue();
    7609             : 
    7610          25 :   const APFloat &F = CSrc->getValueAPF();
    7611          25 :   APFloat Zero = APFloat::getZero(F.getSemantics());
    7612          25 :   APFloat::cmpResult Cmp0 = F.compare(Zero);
    7613          25 :   if (Cmp0 == APFloat::cmpLessThan ||
    7614          12 :       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
    7615          27 :     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
    7616             :   }
    7617             : 
    7618          16 :   APFloat One(F.getSemantics(), "1.0");
    7619          16 :   APFloat::cmpResult Cmp1 = F.compare(One);
    7620          16 :   if (Cmp1 == APFloat::cmpGreaterThan)
    7621           9 :     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
    7622             : 
    7623          13 :   return SDValue(CSrc, 0);
    7624             : }
    7625             : 
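// Standalone sketch (not part of SIISelLowering.cpp): a scalar model of the
// clamp constant fold above, where a constant folds to 0.0, 1.0, or itself,
// and an unordered (NaN) input folds to 0.0 when dx10_clamp is enabled.
#include <cassert>
#include <cmath>

static float foldClamp(float C, bool DX10Clamp) {
  if (C < 0.0f || (std::isnan(C) && DX10Clamp))   // cmpLessThan, or cmpUnordered with dx10_clamp
    return 0.0f;
  if (C > 1.0f)                                   // cmpGreaterThan
    return 1.0f;
  return C;                                       // already in [0, 1] (or NaN without dx10_clamp)
}

int main() {
  assert(foldClamp(-2.0f, true) == 0.0f);
  assert(foldClamp(0.5f, true) == 0.5f);
  assert(foldClamp(3.0f, true) == 1.0f);
  assert(foldClamp(NAN, true) == 0.0f);
  return 0;
}
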
    7626             : 
    7627     1339219 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    7628             :                                             DAGCombinerInfo &DCI) const {
    7629     2678438 :   switch (N->getOpcode()) {
    7630      330167 :   default:
    7631      330167 :     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7632      134808 :   case ISD::ADD:
    7633      134808 :     return performAddCombine(N, DCI);
    7634        2503 :   case ISD::SUB:
    7635        2503 :     return performSubCombine(N, DCI);
    7636         654 :   case ISD::ADDCARRY:
    7637             :   case ISD::SUBCARRY:
    7638         654 :     return performAddCarrySubCarryCombine(N, DCI);
    7639        7783 :   case ISD::FADD:
    7640        7783 :     return performFAddCombine(N, DCI);
    7641        1715 :   case ISD::FSUB:
    7642        1715 :     return performFSubCombine(N, DCI);
    7643        9811 :   case ISD::SETCC:
    7644        9811 :     return performSetCCCombine(N, DCI);
    7645        9227 :   case ISD::FMAXNUM:
    7646             :   case ISD::FMINNUM:
    7647             :   case ISD::SMAX:
    7648             :   case ISD::SMIN:
    7649             :   case ISD::UMAX:
    7650             :   case ISD::UMIN:
    7651             :   case AMDGPUISD::FMIN_LEGACY:
    7652             :   case AMDGPUISD::FMAX_LEGACY: {
    7653       12562 :     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
    7654        3335 :         getTargetMachine().getOptLevel() > CodeGenOpt::None)
    7655        3335 :       return performMinMaxCombine(N, DCI);
    7656             :     break;
    7657             :   }
    7658             :   case ISD::LOAD: {
    7659      231004 :     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
    7660         478 :       return Widened;
    7661             :     LLVM_FALLTHROUGH;
    7662             :   }
    7663             :   case ISD::STORE:
    7664             :   case ISD::ATOMIC_LOAD:
    7665             :   case ISD::ATOMIC_STORE:
    7666             :   case ISD::ATOMIC_CMP_SWAP:
    7667             :   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    7668             :   case ISD::ATOMIC_SWAP:
    7669             :   case ISD::ATOMIC_LOAD_ADD:
    7670             :   case ISD::ATOMIC_LOAD_SUB:
    7671             :   case ISD::ATOMIC_LOAD_AND:
    7672             :   case ISD::ATOMIC_LOAD_OR:
    7673             :   case ISD::ATOMIC_LOAD_XOR:
    7674             :   case ISD::ATOMIC_LOAD_NAND:
    7675             :   case ISD::ATOMIC_LOAD_MIN:
    7676             :   case ISD::ATOMIC_LOAD_MAX:
    7677             :   case ISD::ATOMIC_LOAD_UMIN:
    7678             :   case ISD::ATOMIC_LOAD_UMAX:
    7679             :   case AMDGPUISD::ATOMIC_INC:
    7680             :   case AMDGPUISD::ATOMIC_DEC:
    7681             :   case AMDGPUISD::ATOMIC_LOAD_FADD:
    7682             :   case AMDGPUISD::ATOMIC_LOAD_FMIN:
    7683             :   case AMDGPUISD::ATOMIC_LOAD_FMAX:  // TODO: Target mem intrinsics.
    7684      518652 :     if (DCI.isBeforeLegalize())
    7685             :       break;
    7686      356516 :     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
    7687       30911 :   case ISD::AND:
    7688       30911 :     return performAndCombine(N, DCI);
    7689       16697 :   case ISD::OR:
    7690       16697 :     return performOrCombine(N, DCI);
    7691        1484 :   case ISD::XOR:
    7692        1484 :     return performXorCombine(N, DCI);
    7693       18492 :   case ISD::ZERO_EXTEND:
    7694       18492 :     return performZeroExtendCombine(N, DCI);
    7695          85 :   case AMDGPUISD::FP_CLASS:
    7696          85 :     return performClassCombine(N, DCI);
    7697         412 :   case ISD::FCANONICALIZE:
    7698         412 :     return performFCanonicalizeCombine(N, DCI);
    7699         781 :   case AMDGPUISD::FRACT:
    7700             :   case AMDGPUISD::RCP:
    7701             :   case AMDGPUISD::RSQ:
    7702             :   case AMDGPUISD::RCP_LEGACY:
    7703             :   case AMDGPUISD::RSQ_LEGACY:
    7704             :   case AMDGPUISD::RSQ_CLAMP:
    7705             :   case AMDGPUISD::LDEXP: {
    7706         781 :     SDValue Src = N->getOperand(0);
    7707         781 :     if (Src.isUndef())
    7708          11 :       return Src;
    7709             :     break;
    7710             :   }
    7711        1124 :   case ISD::SINT_TO_FP:
    7712             :   case ISD::UINT_TO_FP:
    7713        1124 :     return performUCharToFloatCombine(N, DCI);
    7714         363 :   case AMDGPUISD::CVT_F32_UBYTE0:
    7715             :   case AMDGPUISD::CVT_F32_UBYTE1:
    7716             :   case AMDGPUISD::CVT_F32_UBYTE2:
    7717             :   case AMDGPUISD::CVT_F32_UBYTE3:
    7718         363 :     return performCvtF32UByteNCombine(N, DCI);
    7719         107 :   case AMDGPUISD::FMED3:
    7720         107 :     return performFMed3Combine(N, DCI);
    7721         139 :   case AMDGPUISD::CVT_PKRTZ_F16_F32:
    7722         139 :     return performCvtPkRTZCombine(N, DCI);
    7723         327 :   case AMDGPUISD::CLAMP:
    7724         327 :     return performClampCombine(N, DCI);
    7725         929 :   case ISD::SCALAR_TO_VECTOR: {
    7726         929 :     SelectionDAG &DAG = DCI.DAG;
    7727        1858 :     EVT VT = N->getValueType(0);
    7728             : 
    7729             :     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    7730             :     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
    7731             :       SDLoc SL(N);
    7732         118 :       SDValue Src = N->getOperand(0);
    7733             :       EVT EltVT = Src.getValueType();
    7734             :       if (EltVT == MVT::f16)
    7735          35 :         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
    7736             : 
    7737         118 :       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
    7738         118 :       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    7739             :     }
    7740             : 
    7741         811 :     break;
    7742             :   }
    7743      138581 :   case ISD::EXTRACT_VECTOR_ELT:
    7744      138581 :     return performExtractVectorEltCombine(N, DCI);
    7745      112989 :   case ISD::BUILD_VECTOR:
    7746      112989 :     return performBuildVectorCombine(N, DCI);
    7747             :   }
    7748      169609 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7749             : }
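A short standalone illustration, separate from the measured source: the SCALAR_TO_VECTOR case above rewrites v2i16/v2f16 (scalar_to_vector x) into a bitcast of an any-extended 32-bit value, placing the scalar in the low 16 bits and leaving the high lane undefined. The plain C++ sketch below models that reinterpretation; it assumes a little-endian host so that the low half of the 32-bit value corresponds to lane 0:

    #include <cstdint>
    #include <cstring>

    // Model of v2i16 (scalar_to_vector i16:x) -> bitcast (any_extend i16:x):
    // widen the scalar to 32 bits (only the low half is defined) and
    // reinterpret the result as two 16-bit lanes.
    static void scalarToV2i16(uint16_t X, uint16_t Out[2]) {
      uint32_t Wide = X;                      // any_extend: low 16 bits defined
      std::memcpy(Out, &Wide, sizeof(Wide));  // bitcast to v2i16
    }

    int main() {
      uint16_t Lanes[2];
      scalarToV2i16(0xBEEF, Lanes);
      return Lanes[0] == 0xBEEF ? 0 : 1;      // lane 0 carries the scalar
    }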
    7750             : 
    7751             : /// Helper function for adjustWritemask
    7752             : static unsigned SubIdx2Lane(unsigned Idx) {
    7753             :   switch (Idx) {
    7754             :   default: return 0;
    7755             :   case AMDGPU::sub0: return 0;
    7756             :   case AMDGPU::sub1: return 1;
    7757             :   case AMDGPU::sub2: return 2;
    7758             :   case AMDGPU::sub3: return 3;
    7759             :   }
    7760             : }
    7761             : 
    7762             : /// Adjust the writemask of MIMG instructions
    7763         583 : SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    7764             :                                           SelectionDAG &DAG) const {
    7765         583 :   SDNode *Users[4] = { nullptr };
    7766             :   unsigned Lane = 0;
    7767        1749 :   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
    7768         583 :   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
    7769             :   unsigned NewDmask = 0;
    7770             :   bool HasChain = Node->getNumValues() > 1;
    7771             : 
    7772         583 :   if (OldDmask == 0) {
    7773             :     // These are folded out, but on the off chance it happens, don't assert.
    7774             :     return Node;
    7775             :   }
    7776             : 
    7777             :   // Try to figure out the used register components
    7778         583 :   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
    7779        1801 :        I != E; ++I) {
    7780             : 
    7781             :     // Don't look at users of the chain.
    7782        1466 :     if (I.getUse().getResNo() != 0)
    7783         150 :       continue;
    7784             : 
    7785             :     // Abort if we can't understand the usage
    7786        1316 :     if (!I->isMachineOpcode() ||
    7787             :         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    7788             :       return Node;
    7789             : 
    7790             :     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    7791             :     // Note that subregs are packed, i.e. Lane==0 is the first bit set
    7792             :     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    7793             :     // set, etc.
    7794        1070 :     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    7795             : 
    7796             :     // Set which texture component corresponds to the lane.
    7797             :     unsigned Comp;
    7798        6016 :     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
    7799        2473 :       Comp = countTrailingZeros(Dmask);
    7800        2473 :       Dmask &= ~(1 << Comp);
    7801             :     }
    7802             : 
    7803             :     // Abort if we have more than one user per component
    7804        1070 :     if (Users[Lane])
    7805             :       return Node;
    7806             : 
    7807        1068 :     Users[Lane] = *I;
    7808        1068 :     NewDmask |= 1 << Comp;
    7809             :   }
    7810             : 
    7811             :   // Abort if there's no change
    7812         335 :   if (NewDmask == OldDmask)
    7813             :     return Node;
    7814             : 
    7815             :   unsigned BitsSet = countPopulation(NewDmask);
    7816             : 
    7817          78 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7818         156 :   int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
    7819          78 :                                           Node->getMachineOpcode(), BitsSet);
    7820             :   assert(NewOpcode != -1 &&
    7821             :          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
    7822             :          "failed to find equivalent MIMG op");
    7823             : 
    7824             :   // Adjust the writemask in the node
    7825             :   SmallVector<SDValue, 12> Ops;
    7826          78 :   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
    7827         312 :   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
    7828         234 :   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
    7829             : 
    7830         156 :   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
    7831             : 
    7832             :   MVT ResultVT = BitsSet == 1 ?
    7833          78 :     SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
    7834             :   SDVTList NewVTList = HasChain ?
    7835         156 :     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
    7836             : 
    7837             : 
    7838         234 :   MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
    7839          78 :                                               NewVTList, Ops);
    7840             : 
    7841          78 :   if (HasChain) {
    7842             :     // Update chain.
    7843          75 :     NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
    7844          75 :     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
    7845             :   }
    7846             : 
    7847          78 :   if (BitsSet == 1) {
    7848             :     assert(Node->hasNUsesOfValue(1, 0));
    7849          88 :     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
    7850         132 :                                       SDLoc(Node), Users[Lane]->getValueType(0),
    7851          44 :                                       SDValue(NewNode, 0));
    7852          44 :     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    7853          44 :     return nullptr;
    7854             :   }
    7855             : 
    7856             :   // Update the users of the node with the new indices
    7857         306 :   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    7858         136 :     SDNode *User = Users[i];
    7859         136 :     if (!User)
    7860          48 :       continue;
    7861             : 
    7862         264 :     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    7863          88 :     DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    7864             : 
    7865          88 :     switch (Idx) {
    7866             :     default: break;
    7867          34 :     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    7868          34 :     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    7869          20 :     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    7870             :     }
    7871             :   }
    7872             : 
    7873          34 :   DAG.RemoveDeadNode(Node);
    7874          34 :   return nullptr;
    7875             : }
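Editorial sketch, not part of the instrumented source: adjustWritemask above shrinks the MIMG dmask to the components that are actually extracted and remaps each user to a packed result lane, where lane 0 corresponds to the first bit still set in the dmask, lane 1 to the second, and so on. A minimal standalone version of that lane-to-component walk in plain C++, using the GCC/Clang __builtin_ctz intrinsic in place of countTrailingZeros:

    #include <cassert>
    #include <cstdint>

    // Given a dmask and a packed result lane, return the texture component
    // (bit position) that lane holds: lane 0 is the first set bit, lane 1
    // the second, etc. -- the same walk the loop in adjustWritemask performs.
    static unsigned laneToComponent(uint32_t Dmask, unsigned Lane) {
      unsigned Comp = 0;
      for (unsigned I = 0; I <= Lane; ++I) {
        assert(Dmask != 0 && "lane out of range for dmask");
        Comp = static_cast<unsigned>(__builtin_ctz(Dmask));
        Dmask &= ~(1u << Comp);
      }
      return Comp;
    }

    int main() {
      // dmask = 0b1010: lane 0 holds component 1 (Y), lane 1 holds component 3 (W).
      return (laneToComponent(0xA, 0) == 1 && laneToComponent(0xA, 1) == 3) ? 0 : 1;
    }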
    7876             : 
    7877             : static bool isFrameIndexOp(SDValue Op) {
    7878      330172 :   if (Op.getOpcode() == ISD::AssertZext)
    7879          64 :     Op = Op.getOperand(0);
    7880             : 
    7881             :   return isa<FrameIndexSDNode>(Op);
    7882             : }
    7883             : 
    7884             : /// Legalize target independent instructions (e.g. INSERT_SUBREG)
    7885             : /// with frame index operands.
    7886             : /// LLVM assumes that inputs to these instructions are registers.
    7887       53786 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
    7888             :                                                         SelectionDAG &DAG) const {
    7889       53786 :   if (Node->getOpcode() == ISD::CopyToReg) {
    7890       12074 :     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    7891       12074 :     SDValue SrcVal = Node->getOperand(2);
    7892             : 
    7893             :     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    7894             :     // to try understanding copies to physical registers.
    7895         167 :     if (SrcVal.getValueType() == MVT::i1 &&
    7896         167 :         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
    7897             :       SDLoc SL(Node);
    7898           8 :       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7899             :       SDValue VReg = DAG.getRegister(
    7900           8 :         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
    7901             : 
    7902             :       SDNode *Glued = Node->getGluedNode();
    7903             :       SDValue ToVReg
    7904           8 :         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
    7905          12 :                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
    7906             :       SDValue ToResultReg
    7907             :         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
    7908          16 :                            VReg, ToVReg.getValue(1));
    7909           8 :       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
    7910           8 :       DAG.RemoveDeadNode(Node);
    7911             :       return ToResultReg.getNode();
    7912             :     }
    7913             :   }
    7914             : 
    7915             :   SmallVector<SDValue, 8> Ops;
    7916     1098072 :   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    7917      990494 :     if (!isFrameIndexOp(Node->getOperand(i))) {
    7918      330150 :       Ops.push_back(Node->getOperand(i));
    7919      330150 :       continue;
    7920             :     }
    7921             : 
    7922             :     SDLoc DL(Node);
    7923          66 :     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
    7924             :                                      Node->getOperand(i).getValueType(),
    7925          22 :                                      Node->getOperand(i)), 0));
    7926             :   }
    7927             : 
    7928       53778 :   return DAG.UpdateNodeOperands(Node, Ops);
    7929             : }
    7930             : 
    7931             : /// Fold the instructions after selecting them.
    7932             : /// Returns null if users were already updated.
    7933      363046 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
    7934             :                                           SelectionDAG &DAG) const {
    7935      363046 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7936      363046 :   unsigned Opcode = Node->getMachineOpcode();
    7937             : 
    7938      365607 :   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
    7939      363686 :       !TII->isGather4(Opcode) && !TII->isD16(Opcode)) {
    7940         583 :     return adjustWritemask(Node, DAG);
    7941             :   }
    7942             : 
    7943      724926 :   if (Opcode == AMDGPU::INSERT_SUBREG ||
    7944      362463 :       Opcode == AMDGPU::REG_SEQUENCE) {
    7945       41712 :     legalizeTargetIndependentNode(Node, DAG);
    7946       41712 :     return Node;
    7947             :   }
    7948             : 
    7949      320751 :   switch (Opcode) {
    7950         273 :   case AMDGPU::V_DIV_SCALE_F32:
    7951             :   case AMDGPU::V_DIV_SCALE_F64: {
    7952             :     // Satisfy the operand register constraint when one of the inputs is
    7953             :     // undefined. Ordinarily each undef value will have its own implicit_def of
    7954             :     // a vreg, so force these to use a single register.
    7955         273 :     SDValue Src0 = Node->getOperand(0);
    7956         273 :     SDValue Src1 = Node->getOperand(1);
    7957         273 :     SDValue Src2 = Node->getOperand(2);
    7958             : 
    7959         270 :     if ((Src0.isMachineOpcode() &&
    7960         273 :          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
    7961             :         (Src0 == Src1 || Src0 == Src2))
    7962             :       break;
    7963             : 
    7964             :     MVT VT = Src0.getValueType().getSimpleVT();
    7965           6 :     const TargetRegisterClass *RC = getRegClassFor(VT);
    7966             : 
    7967           6 :     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7968           6 :     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
    7969             : 
    7970          12 :     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
    7971          18 :                                       UndefReg, Src0, SDValue());
    7972             : 
    7973             :     // src0 must be the same register as src1 or src2, even if the value is
    7974             :     // undefined, so make sure we don't violate this constraint.
    7975           6 :     if (Src0.isMachineOpcode() &&
    7976             :         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
    7977           3 :       if (Src1.isMachineOpcode() &&
    7978             :           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7979             :         Src0 = Src1;
    7980           3 :       else if (Src2.isMachineOpcode() &&
    7981             :                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7982             :         Src0 = Src2;
    7983             :       else {
    7984             :         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
    7985           1 :         Src0 = UndefReg;
    7986             :         Src1 = UndefReg;
    7987             :       }
    7988             :     } else
    7989             :       break;
    7990             : 
    7991           6 :     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    7992           6 :     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
    7993           0 :       Ops.push_back(Node->getOperand(I));
    7994             : 
    7995           3 :     Ops.push_back(ImpDef.getValue(1));
    7996           9 :     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    7997             :   }
    7998             :   default:
    7999             :     break;
    8000             :   }
    8001             : 
    8002      320748 :   return Node;
    8003             : }
    8004             : 
    8005             : /// Assign the register class depending on the number of
    8006             : /// bits set in the writemask
    8007       32519 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
    8008             :                                                      SDNode *Node) const {
    8009       32519 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    8010             : 
    8011       32519 :   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    8012             : 
    8013       97557 :   if (TII->isVOP3(MI.getOpcode())) {
    8014             :     // Make sure constant bus requirements are respected.
    8015       30184 :     TII->legalizeOperandsVOP3(MRI, MI);
    8016       30184 :     return;
    8017             :   }
    8018             : 
    8019             :   // Replace unused atomics with the no return version.
    8020        2335 :   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
    8021        2335 :   if (NoRetAtomicOp != -1) {
    8022        1840 :     if (!Node->hasAnyUseOfValue(0)) {
    8023         946 :       MI.setDesc(TII->get(NoRetAtomicOp));
    8024         946 :       MI.RemoveOperand(0);
    8025         946 :       return;
    8026             :     }
    8027             : 
    8028             :     // For mubuf_atomic_cmpswap, we need tablegen to use an extract_subreg
    8029             :     // instruction, because the return type of these instructions is a vec2 of
    8030             :     // the memory type, so that the result can be tied to the input operand.
    8031             :     // This means these instructions always have a use, so we need to add a
    8032             :     // special case to check if the atomic has only one extract_subreg use,
    8033             :     // which itself has no uses.
    8034        1786 :     if ((Node->hasNUsesOfValue(1, 0) &&
    8035        2652 :          Node->use_begin()->isMachineOpcode() &&
    8036         906 :          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
    8037          12 :          !Node->use_begin()->hasAnyUseOfValue(0))) {
    8038           0 :       unsigned Def = MI.getOperand(0).getReg();
    8039             : 
    8040             :       // Change this into a noret atomic.
    8041           0 :       MI.setDesc(TII->get(NoRetAtomicOp));
    8042           0 :       MI.RemoveOperand(0);
    8043             : 
    8044             :       // If we only remove the def operand from the atomic instruction, the
    8045             :       // extract_subreg will be left with a use of a vreg without a def.
    8046             :       // So we need to insert an implicit_def to avoid machine verifier
    8047             :       // errors.
    8048           0 :       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
    8049           0 :               TII->get(AMDGPU::IMPLICIT_DEF), Def);
    8050             :     }
    8051             :     return;
    8052             :   }
    8053             : }
    8054             : 
    8055       42568 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
    8056             :                               uint64_t Val) {
    8057       42568 :   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
    8058       42568 :   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
    8059             : }
    8060             : 
    8061        4139 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
    8062             :                                                 const SDLoc &DL,
    8063             :                                                 SDValue Ptr) const {
    8064        4139 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    8065             : 
    8066             :   // Build the half of the subregister with the constants before building the
    8067             :   // full 128-bit register. If we are building multiple resource descriptors,
    8068             :   // this will allow CSEing of the 2-component register.
    8069             :   const SDValue Ops0[] = {
    8070             :     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    8071             :     buildSMovImm32(DAG, DL, 0),
    8072             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    8073        4139 :     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    8074             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    8075       16556 :   };
    8076             : 
    8077        4139 :   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
    8078             :                                                 MVT::v2i32, Ops0), 0);
    8079             : 
    8080             :   // Combine the constants and the pointer.
    8081             :   const SDValue Ops1[] = {
    8082             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    8083             :     Ptr,
    8084             :     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    8085             :     SubRegHi,
    8086             :     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
    8087       12417 :   };
    8088             : 
    8089        4139 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
    8090             : }
    8091             : 
    8092             : /// Return a resource descriptor with the 'Add TID' bit enabled
    8093             : ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
    8094             : ///        of the resource descriptor) to create an offset, which is added to
    8095             : ///        the resource pointer.
    8096       17145 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
    8097             :                                            SDValue Ptr, uint32_t RsrcDword1,
    8098             :                                            uint64_t RsrcDword2And3) const {
    8099       17145 :   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
    8100       17145 :   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
    8101       17145 :   if (RsrcDword1) {
    8102           0 :     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
    8103             :                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
    8104             :                     0);
    8105             :   }
    8106             : 
    8107             :   SDValue DataLo = buildSMovImm32(DAG, DL,
    8108       17145 :                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
    8109       17145 :   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
    8110             : 
    8111             :   const SDValue Ops[] = {
    8112             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    8113             :     PtrLo,
    8114             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    8115             :     PtrHi,
    8116             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    8117             :     DataLo,
    8118             :     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    8119             :     DataHi,
    8120             :     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
    8121       85725 :   };
    8122             : 
    8123       17145 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
    8124             : }
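As a side note to the descriptor-building code above: buildRSRC assembles a 128-bit resource descriptor from the 64-bit pointer plus the constant dwords, and buildSMovImm32 materializes those constants 32 bits at a time, so the 64-bit RsrcDword2And3 value is split into a low and a high dword first. A trivial standalone sketch of that split in plain C++ (the struct and function names are invented for illustration):

    #include <cstdint>

    // Split a 64-bit descriptor word into the two 32-bit dwords that are
    // materialized separately above (DataLo = low dword, DataHi = high dword).
    struct DwordPair { uint32_t Lo, Hi; };

    static DwordPair splitRsrcWord(uint64_t RsrcDword2And3) {
      return { static_cast<uint32_t>(RsrcDword2And3 & UINT64_C(0xFFFFFFFF)),
               static_cast<uint32_t>(RsrcDword2And3 >> 32) };
    }

    int main() {
      DwordPair P = splitRsrcWord(UINT64_C(0x1122334455667788));
      return (P.Lo == 0x55667788u && P.Hi == 0x11223344u) ? 0 : 1;
    }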
    8125             : 
    8126             : //===----------------------------------------------------------------------===//
    8127             : //                         SI Inline Assembly Support
    8128             : //===----------------------------------------------------------------------===//
    8129             : 
    8130             : std::pair<unsigned, const TargetRegisterClass *>
    8131        2090 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    8132             :                                                StringRef Constraint,
    8133             :                                                MVT VT) const {
    8134             :   const TargetRegisterClass *RC = nullptr;
    8135        2090 :   if (Constraint.size() == 1) {
    8136        1248 :     switch (Constraint[0]) {
    8137           0 :     default:
    8138           0 :       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    8139         341 :     case 's':
    8140             :     case 'r':
    8141         341 :       switch (VT.getSizeInBits()) {
    8142           6 :       default:
    8143           6 :         return std::make_pair(0U, nullptr);
    8144             :       case 32:
    8145             :       case 16:
    8146             :         RC = &AMDGPU::SReg_32_XM0RegClass;
    8147             :         break;
    8148          73 :       case 64:
    8149             :         RC = &AMDGPU::SGPR_64RegClass;
    8150          73 :         break;
    8151          18 :       case 128:
    8152             :         RC = &AMDGPU::SReg_128RegClass;
    8153          18 :         break;
    8154          48 :       case 256:
    8155             :         RC = &AMDGPU::SReg_256RegClass;
    8156          48 :         break;
    8157          32 :       case 512:
    8158             :         RC = &AMDGPU::SReg_512RegClass;
    8159          32 :         break;
    8160             :       }
    8161             :       break;
    8162         283 :     case 'v':
    8163         283 :       switch (VT.getSizeInBits()) {
    8164           6 :       default:
    8165           6 :         return std::make_pair(0U, nullptr);
    8166             :       case 32:
    8167             :       case 16:
    8168             :         RC = &AMDGPU::VGPR_32RegClass;
    8169             :         break;
    8170          42 :       case 64:
    8171             :         RC = &AMDGPU::VReg_64RegClass;
    8172          42 :         break;
    8173           0 :       case 96:
    8174             :         RC = &AMDGPU::VReg_96RegClass;
    8175           0 :         break;
    8176          23 :       case 128:
    8177             :         RC = &AMDGPU::VReg_128RegClass;
    8178          23 :         break;
    8179           0 :       case 256:
    8180             :         RC = &AMDGPU::VReg_256RegClass;
    8181           0 :         break;
    8182           0 :       case 512:
    8183             :         RC = &AMDGPU::VReg_512RegClass;
    8184           0 :         break;
    8185             :       }
    8186             :       break;
    8187             :     }
    8188             :     // We actually support i128, i16 and f16 as inline parameters
    8189             :     // even if they are not reported as legal
    8190          38 :     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
    8191          26 :                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    8192             :       return std::make_pair(0U, RC);
    8193             :   }
    8194             : 
    8195        1484 :   if (Constraint.size() > 1) {
    8196        2932 :     if (Constraint[1] == 'v') {
    8197             :       RC = &AMDGPU::VGPR_32RegClass;
    8198         722 :     } else if (Constraint[1] == 's') {
    8199             :       RC = &AMDGPU::SGPR_32RegClass;
    8200             :     }
    8201             : 
    8202        1466 :     if (RC) {
    8203             :       uint32_t Idx;
    8204        2600 :       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
    8205        1300 :       if (!Failed && Idx < RC->getNumRegs())
    8206             :         return std::make_pair(RC->getRegister(Idx), RC);
    8207             :     }
    8208             :   }
    8209        1484 :   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    8210             : }
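A standalone illustration, outside the measured source: in the multi-character branch above, a constraint whose second character is 'v' or 's' names a single VGPR or SGPR, and the decimal suffix parsed with getAsInteger selects the register index within that class. The plain C++ sketch below models that "letter plus decimal index" parsing; the exact constraint spelling the frontend hands down is not shown here, so the input format below is an assumption of the sketch:

    #include <cctype>
    #include <cstdlib>
    #include <string>

    enum class RegFile { None, VGPR, SGPR };

    // Parse a "v<N>" / "s<N>" style register name: the letter selects the
    // register file, the decimal suffix selects the register index.
    static RegFile parseRegName(const std::string &C, unsigned &Idx) {
      if (C.size() < 2 || (C[0] != 'v' && C[0] != 's'))
        return RegFile::None;
      for (size_t I = 1; I < C.size(); ++I)
        if (!std::isdigit(static_cast<unsigned char>(C[I])))
          return RegFile::None;
      Idx = static_cast<unsigned>(std::strtoul(C.c_str() + 1, nullptr, 10));
      return C[0] == 'v' ? RegFile::VGPR : RegFile::SGPR;
    }

    int main() {
      unsigned Idx = 0;
      return (parseRegName("v17", Idx) == RegFile::VGPR && Idx == 17) ? 0 : 1;
    }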
    8211             : 
    8212             : SITargetLowering::ConstraintType
    8213        7005 : SITargetLowering::getConstraintType(StringRef Constraint) const {
    8214        7005 :   if (Constraint.size() == 1) {
    8215        5114 :     switch (Constraint[0]) {
    8216             :     default: break;
    8217             :     case 's':
    8218             :     case 'v':
    8219             :       return C_RegisterClass;
    8220             :     }
    8221             :   }
    8222        4632 :   return TargetLowering::getConstraintType(Constraint);
    8223             : }
    8224             : 
    8225             : // Figure out which registers should be reserved for stack access. Only after
    8226             : // the function is legalized do we know all of the non-spill stack objects or if
    8227             : // calls are present.
    8228       17860 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
    8229       17860 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    8230       17860 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    8231       17860 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
    8232       17860 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    8233             :   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    8234             : 
    8235       17860 :   if (Info->isEntryFunction()) {
    8236             :     // Callable functions have fixed registers used for stack access.
    8237       16459 :     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
    8238             :   }
    8239             : 
    8240             :   // We have to assume the SP is needed in case there are calls in the function
    8241             :   // during lowering. Calls are only detected after the function is
    8242             :   // lowered. We're about to reserve registers, so don't bother using it if we
    8243             :   // aren't really going to use it.
    8244       34319 :   bool NeedSP = !Info->isEntryFunction() ||
    8245       34317 :     MFI.hasVarSizedObjects() ||
    8246       16457 :     MFI.hasCalls();
    8247             : 
    8248             :   if (NeedSP) {
    8249        1748 :     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    8250             :     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
    8251             : 
    8252             :     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    8253             :     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
    8254             :                                Info->getStackPtrOffsetReg()));
    8255        1748 :     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
    8256             :   }
    8257             : 
    8258       17860 :   MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
    8259       17860 :   MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
    8260       17860 :   MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
    8261             :                      Info->getScratchWaveOffsetReg());
    8262             : 
    8263       17860 :   Info->limitOccupancy(MF);
    8264             : 
    8265       17860 :   TargetLoweringBase::finalizeLowering(MF);
    8266       17860 : }
    8267             : 
    8268      451168 : void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
    8269             :                                                      KnownBits &Known,
    8270             :                                                      const APInt &DemandedElts,
    8271             :                                                      const SelectionDAG &DAG,
    8272             :                                                      unsigned Depth) const {
    8273      451168 :   TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
    8274             :                                                 DAG, Depth);
    8275             : 
    8276      451168 :   if (getSubtarget()->enableHugePrivateBuffer())
    8277             :     return;
    8278             : 
    8279             :   // Technically it may be possible to have a dispatch with a single workitem
    8280             :   // that uses the full private memory size, but that's not really useful. We
    8281             :   // can't use vaddr in MUBUF instructions if we don't know the address
    8282             :   // calculation won't overflow, so assume the sign bit is never set.
    8283      451160 :   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
    8284             : }
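Editorial sketch, not part of the instrumented source: the override above tells the DAG that frame-index values have their high bits, including the sign bit, known to be zero, so MUBUF address arithmetic on them can be assumed not to overflow. The plain C++ snippet below shows the shape of the mask Known.Zero.setHighBits installs for a 32-bit value; the bit count is a parameter here, standing in for AssumeFrameIndexHighZeroBits:

    #include <cstdint>

    // Build a 32-bit mask whose top HighZeroBits bits are set -- the shape of
    // the known-zero mask installed for frame indices above.
    static uint32_t highZeroMask(unsigned HighZeroBits) {
      if (HighZeroBits == 0)
        return 0;
      if (HighZeroBits >= 32)
        return ~UINT32_C(0);
      return ~UINT32_C(0) << (32 - HighZeroBits);
    }

    int main() {
      // Even a single known-zero high bit proves the sign bit of the index is
      // clear, which is the property the comment above relies on.
      return (highZeroMask(1) == UINT32_C(0x80000000)) ? 0 : 1;
    }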
    8285             : 
    8286     4258290 : bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
    8287             :   FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
    8288             : {
    8289     8516580 :   switch (N->getOpcode()) {
    8290      420840 :     case ISD::Register:
    8291             :     case ISD::CopyFromReg:
    8292             :     {
    8293             :       const RegisterSDNode *R = nullptr;
    8294      420840 :       if (N->getOpcode() == ISD::Register) {
    8295             :         R = dyn_cast<RegisterSDNode>(N);
    8296             :       }
    8297             :       else {
    8298      177541 :         R = dyn_cast<RegisterSDNode>(N->getOperand(1));
    8299             :       }
    8300      420840 :       if (R)
    8301             :       {
    8302      420840 :         const MachineFunction * MF = FLI->MF;
    8303      420840 :         const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
    8304      420840 :         const MachineRegisterInfo &MRI = MF->getRegInfo();
    8305             :         const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    8306      420840 :         unsigned Reg = R->getReg();
    8307      420840 :         if (TRI.isPhysicalRegister(Reg))
    8308       50061 :           return TRI.isVGPR(MRI, Reg);
    8309             : 
    8310      370779 :         if (MRI.isLiveIn(Reg)) {
    8311             :           // workitem.id.x workitem.id.y workitem.id.z
    8312             :           // Any VGPR formal argument is also considered divergent
    8313      283768 :           if (TRI.isVGPR(MRI, Reg))
    8314             :               return true;
    8315             :           // Formal arguments of non-entry functions
    8316             :           // are conservatively considered divergent
    8317      418652 :           else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
    8318             :             return true;
    8319             :         }
    8320      280847 :         return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
    8321           0 :       }
    8322             :     }
    8323             :     break;
    8324             :     case ISD::LOAD: {
    8325             :       const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
    8326      655090 :       if (L->getMemOperand()->getAddrSpace() ==
    8327      327545 :           Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
    8328      316781 :         return true;
    8329             :     } break;
    8330             :     case ISD::CALLSEQ_END:
    8331             :       return true;
    8333       18804 :     case ISD::INTRINSIC_WO_CHAIN:
    8337       18804 :       return AMDGPU::isIntrinsicSourceOfDivergence(
    8338       56412 :       cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
    8339        6245 :     case ISD::INTRINSIC_W_CHAIN:
    8340        6245 :       return AMDGPU::isIntrinsicSourceOfDivergence(
    8341       18735 :       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
    8342             :     // In some cases intrinsics that are a source of divergence have been
    8343             :     // lowered to AMDGPUISD so we also need to check those too.
    8344             :     case AMDGPUISD::INTERP_MOV:
    8345             :     case AMDGPUISD::INTERP_P1:
    8346             :     case AMDGPUISD::INTERP_P2:
    8347             :       return true;
    8348             :   }
    8349             :   return false;
    8350      303507 : }

Generated by: LCOV version 1.13