Line data Source code
1 : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// Custom DAG lowering for SI
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #ifdef _MSC_VER
16 : // Provide M_PI.
17 : #define _USE_MATH_DEFINES
18 : #endif
19 :
20 : #include "SIISelLowering.h"
21 : #include "AMDGPU.h"
22 : #include "AMDGPUIntrinsicInfo.h"
23 : #include "AMDGPUSubtarget.h"
24 : #include "AMDGPUTargetMachine.h"
25 : #include "SIDefines.h"
26 : #include "SIInstrInfo.h"
27 : #include "SIMachineFunctionInfo.h"
28 : #include "SIRegisterInfo.h"
29 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
30 : #include "Utils/AMDGPUBaseInfo.h"
31 : #include "llvm/ADT/APFloat.h"
32 : #include "llvm/ADT/APInt.h"
33 : #include "llvm/ADT/ArrayRef.h"
34 : #include "llvm/ADT/BitVector.h"
35 : #include "llvm/ADT/SmallVector.h"
36 : #include "llvm/ADT/Statistic.h"
37 : #include "llvm/ADT/StringRef.h"
38 : #include "llvm/ADT/StringSwitch.h"
39 : #include "llvm/ADT/Twine.h"
40 : #include "llvm/CodeGen/Analysis.h"
41 : #include "llvm/CodeGen/CallingConvLower.h"
42 : #include "llvm/CodeGen/DAGCombine.h"
43 : #include "llvm/CodeGen/ISDOpcodes.h"
44 : #include "llvm/CodeGen/MachineBasicBlock.h"
45 : #include "llvm/CodeGen/MachineFrameInfo.h"
46 : #include "llvm/CodeGen/MachineFunction.h"
47 : #include "llvm/CodeGen/MachineInstr.h"
48 : #include "llvm/CodeGen/MachineInstrBuilder.h"
49 : #include "llvm/CodeGen/MachineMemOperand.h"
50 : #include "llvm/CodeGen/MachineModuleInfo.h"
51 : #include "llvm/CodeGen/MachineOperand.h"
52 : #include "llvm/CodeGen/MachineRegisterInfo.h"
53 : #include "llvm/CodeGen/SelectionDAG.h"
54 : #include "llvm/CodeGen/SelectionDAGNodes.h"
55 : #include "llvm/CodeGen/TargetCallingConv.h"
56 : #include "llvm/CodeGen/TargetRegisterInfo.h"
57 : #include "llvm/CodeGen/ValueTypes.h"
58 : #include "llvm/IR/Constants.h"
59 : #include "llvm/IR/DataLayout.h"
60 : #include "llvm/IR/DebugLoc.h"
61 : #include "llvm/IR/DerivedTypes.h"
62 : #include "llvm/IR/DiagnosticInfo.h"
63 : #include "llvm/IR/Function.h"
64 : #include "llvm/IR/GlobalValue.h"
65 : #include "llvm/IR/InstrTypes.h"
66 : #include "llvm/IR/Instruction.h"
67 : #include "llvm/IR/Instructions.h"
68 : #include "llvm/IR/IntrinsicInst.h"
69 : #include "llvm/IR/Type.h"
70 : #include "llvm/Support/Casting.h"
71 : #include "llvm/Support/CodeGen.h"
72 : #include "llvm/Support/CommandLine.h"
73 : #include "llvm/Support/Compiler.h"
74 : #include "llvm/Support/ErrorHandling.h"
75 : #include "llvm/Support/KnownBits.h"
76 : #include "llvm/Support/MachineValueType.h"
77 : #include "llvm/Support/MathExtras.h"
78 : #include "llvm/Target/TargetOptions.h"
79 : #include <cassert>
80 : #include <cmath>
81 : #include <cstdint>
82 : #include <iterator>
83 : #include <tuple>
84 : #include <utility>
85 : #include <vector>
86 :
87 : using namespace llvm;
88 :
89 : #define DEBUG_TYPE "si-lower"
90 :
91 : STATISTIC(NumTailCalls, "Number of tail calls");
92 :
93 : static cl::opt<bool> EnableVGPRIndexMode(
94 : "amdgpu-vgpr-index-mode",
95 : cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96 : cl::init(false));
97 :
98 : static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
99 : "amdgpu-frame-index-zero-bits",
100 : cl::desc("High bits of frame index assumed to be zero"),
101 : cl::init(5),
102 : cl::ReallyHidden);
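// A hypothetical invocation overriding both options above (the flag names come
// from the cl::opt registrations here; the rest of the llc command line is an
// assumption for illustration only):
//
//   llc -mtriple=amdgcn-- -amdgpu-vgpr-index-mode -amdgpu-frame-index-zero-bits=3 kernel.ll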
103 :
104 : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105 45 : unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106 209 : for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107 418 : if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108 : return AMDGPU::SGPR0 + Reg;
109 : }
110 : }
111 0 : llvm_unreachable("Cannot allocate sgpr");
112 : }
113 :
114 2492 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
115 2492 : const GCNSubtarget &STI)
116 : : AMDGPUTargetLowering(TM, STI),
117 2492 : Subtarget(&STI) {
118 : addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119 : addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120 :
121 : addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122 : addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123 :
124 : addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125 : addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126 : addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127 :
128 : addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129 : addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130 :
131 : addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132 : addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133 :
134 : addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135 : addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136 :
137 : addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138 : addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139 :
140 2492 : if (Subtarget->has16BitInsts()) {
141 : addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142 : addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143 :
144 : // Unless there are also VOP3P operations, not even the vector operations are really legal.
145 : addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146 : addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147 : addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148 : addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149 : }
150 :
151 2492 : computeRegisterProperties(Subtarget->getRegisterInfo());
152 :
153 : // We need to custom lower vector loads and stores from local memory.
154 : setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
155 : setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156 : setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
157 : setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
158 : setOperationAction(ISD::LOAD, MVT::i1, Custom);
159 : setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
160 :
161 : setOperationAction(ISD::STORE, MVT::v2i32, Custom);
162 : setOperationAction(ISD::STORE, MVT::v4i32, Custom);
163 : setOperationAction(ISD::STORE, MVT::v8i32, Custom);
164 : setOperationAction(ISD::STORE, MVT::v16i32, Custom);
165 : setOperationAction(ISD::STORE, MVT::i1, Custom);
166 : setOperationAction(ISD::STORE, MVT::v32i32, Custom);
167 :
168 : setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
169 : setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
170 : setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
171 : setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
172 : setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
173 : setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
174 : setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
175 : setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
176 : setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
177 : setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
178 :
179 : setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
180 : setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
181 :
182 : setOperationAction(ISD::SELECT, MVT::i1, Promote);
183 : setOperationAction(ISD::SELECT, MVT::i64, Custom);
184 : setOperationAction(ISD::SELECT, MVT::f64, Promote);
185 : AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
186 :
187 : setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
188 : setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
189 : setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
190 : setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
191 : setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
192 :
193 : setOperationAction(ISD::SETCC, MVT::i1, Promote);
194 : setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
195 : setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
196 : AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
197 :
198 : setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
199 : setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
200 :
201 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
202 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
203 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
204 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
205 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
206 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
207 : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
208 :
209 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
210 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
211 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
212 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
213 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
214 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
215 : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
216 :
217 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
218 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
219 : setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
220 :
221 : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
222 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
223 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
224 : setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
225 :
226 : setOperationAction(ISD::BRCOND, MVT::Other, Custom);
227 : setOperationAction(ISD::BR_CC, MVT::i1, Expand);
228 : setOperationAction(ISD::BR_CC, MVT::i32, Expand);
229 : setOperationAction(ISD::BR_CC, MVT::i64, Expand);
230 : setOperationAction(ISD::BR_CC, MVT::f32, Expand);
231 : setOperationAction(ISD::BR_CC, MVT::f64, Expand);
232 :
233 : setOperationAction(ISD::UADDO, MVT::i32, Legal);
234 : setOperationAction(ISD::USUBO, MVT::i32, Legal);
235 :
236 : setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
237 : setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
238 :
239 : setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
240 : setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
241 : setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
242 :
243 : #if 0
244 : setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
245 : setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
246 : #endif
247 :
248 : // We only support LOAD/STORE and vector manipulation ops for vectors
249 : // with > 4 elements.
250 22428 : for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
251 24920 : MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
252 5853708 : for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
253 5831280 : switch (Op) {
254 : case ISD::LOAD:
255 : case ISD::STORE:
256 : case ISD::BUILD_VECTOR:
257 : case ISD::BITCAST:
258 : case ISD::EXTRACT_VECTOR_ELT:
259 : case ISD::INSERT_VECTOR_ELT:
260 : case ISD::INSERT_SUBVECTOR:
261 : case ISD::EXTRACT_SUBVECTOR:
262 : case ISD::SCALAR_TO_VECTOR:
263 : break;
264 22428 : case ISD::CONCAT_VECTORS:
265 : setOperationAction(Op, VT, Custom);
266 22428 : break;
267 5607000 : default:
268 : setOperationAction(Op, VT, Expand);
269 5607000 : break;
270 : }
271 : }
272 : }
273 :
274 : setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
275 :
276 : // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277 : // is expanded to avoid having two separate loops in case the index is a VGPR.
278 :
279 : // Most operations are naturally 32-bit vector operations. We only support
280 : // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281 7476 : for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
282 : setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
283 : AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
284 :
285 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
286 : AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
287 :
288 : setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
289 : AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
290 :
291 : setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
292 : AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
293 : }
294 :
295 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
296 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
297 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
298 : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
299 :
300 : setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
301 : setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
302 :
303 : // Avoid stack access for these.
304 : // TODO: Generalize to more vector types.
305 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
306 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
307 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
308 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
309 :
310 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
311 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
312 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
313 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
314 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
315 :
316 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
317 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
318 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
319 :
320 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
321 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
322 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
323 : setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
324 :
325 : // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326 : // and output demarshalling
327 : setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
328 : setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
329 :
330 : // We can't return success/failure, only the old value,
331 : // so let LLVM add the comparison.
332 : setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
333 : setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
334 :
335 2492 : if (Subtarget->hasFlatAddressSpace()) {
336 : setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
337 : setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
338 : }
339 :
340 : setOperationAction(ISD::BSWAP, MVT::i32, Legal);
341 : setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
342 :
343 : // This is s_memtime on SI and s_memrealtime on VI.
344 : setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
345 : setOperationAction(ISD::TRAP, MVT::Other, Custom);
346 : setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
347 :
348 2492 : if (Subtarget->has16BitInsts()) {
349 : setOperationAction(ISD::FLOG, MVT::f16, Custom);
350 : setOperationAction(ISD::FEXP, MVT::f16, Custom);
351 : setOperationAction(ISD::FLOG10, MVT::f16, Custom);
352 : }
353 :
354 : // v_mad_f32 does not support denormals according to some sources.
355 2492 : if (!Subtarget->hasFP32Denormals())
356 : setOperationAction(ISD::FMAD, MVT::f32, Legal);
357 :
358 : if (!Subtarget->hasBFI()) {
359 : // fcopysign can be done in a single instruction with BFI.
360 : setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
361 : setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
362 : }
363 :
364 : if (!Subtarget->hasBCNT(32))
365 : setOperationAction(ISD::CTPOP, MVT::i32, Expand);
366 :
367 : if (!Subtarget->hasBCNT(64))
368 : setOperationAction(ISD::CTPOP, MVT::i64, Expand);
369 :
370 : if (Subtarget->hasFFBH())
371 : setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
372 :
373 : if (Subtarget->hasFFBL())
374 : setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
375 :
376 : // We only really have 32-bit BFE instructions (and 16-bit on VI).
377 : //
378 : // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379 : // effort to match them now. We want this to be false for i64 cases when the
380 : // extraction isn't restricted to the upper or lower half. Ideally we would
381 : // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382 : // span the midpoint are probably relatively rare, so don't worry about them
383 : // for now.
384 : if (Subtarget->hasBFE())
385 : setHasExtractBitsInsn(true);
386 :
387 : setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
388 : setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
389 :
390 2492 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
391 : setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
392 : setOperationAction(ISD::FCEIL, MVT::f64, Legal);
393 : setOperationAction(ISD::FRINT, MVT::f64, Legal);
394 : } else {
395 : setOperationAction(ISD::FCEIL, MVT::f64, Custom);
396 : setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
397 : setOperationAction(ISD::FRINT, MVT::f64, Custom);
398 : setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
399 : }
400 :
401 : setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
402 :
403 : setOperationAction(ISD::FSIN, MVT::f32, Custom);
404 : setOperationAction(ISD::FCOS, MVT::f32, Custom);
405 : setOperationAction(ISD::FDIV, MVT::f32, Custom);
406 : setOperationAction(ISD::FDIV, MVT::f64, Custom);
407 :
408 2492 : if (Subtarget->has16BitInsts()) {
409 : setOperationAction(ISD::Constant, MVT::i16, Legal);
410 :
411 : setOperationAction(ISD::SMIN, MVT::i16, Legal);
412 : setOperationAction(ISD::SMAX, MVT::i16, Legal);
413 :
414 : setOperationAction(ISD::UMIN, MVT::i16, Legal);
415 : setOperationAction(ISD::UMAX, MVT::i16, Legal);
416 :
417 : setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
418 : AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
419 :
420 : setOperationAction(ISD::ROTR, MVT::i16, Promote);
421 : setOperationAction(ISD::ROTL, MVT::i16, Promote);
422 :
423 : setOperationAction(ISD::SDIV, MVT::i16, Promote);
424 : setOperationAction(ISD::UDIV, MVT::i16, Promote);
425 : setOperationAction(ISD::SREM, MVT::i16, Promote);
426 : setOperationAction(ISD::UREM, MVT::i16, Promote);
427 :
428 : setOperationAction(ISD::BSWAP, MVT::i16, Promote);
429 : setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
430 :
431 : setOperationAction(ISD::CTTZ, MVT::i16, Promote);
432 : setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
433 : setOperationAction(ISD::CTLZ, MVT::i16, Promote);
434 : setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
435 : setOperationAction(ISD::CTPOP, MVT::i16, Promote);
436 :
437 : setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
438 :
439 : setOperationAction(ISD::BR_CC, MVT::i16, Expand);
440 :
441 : setOperationAction(ISD::LOAD, MVT::i16, Custom);
442 :
443 : setTruncStoreAction(MVT::i64, MVT::i16, Expand);
444 :
445 : setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
446 : AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
447 : setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
448 : AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
449 :
450 : setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
451 : setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
452 : setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
453 : setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
454 :
455 : // F16 - Constant Actions.
456 : setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
457 :
458 : // F16 - Load/Store Actions.
459 : setOperationAction(ISD::LOAD, MVT::f16, Promote);
460 : AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
461 : setOperationAction(ISD::STORE, MVT::f16, Promote);
462 : AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
463 :
464 : // F16 - VOP1 Actions.
465 : setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
466 : setOperationAction(ISD::FCOS, MVT::f16, Promote);
467 : setOperationAction(ISD::FSIN, MVT::f16, Promote);
468 : setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
469 : setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
470 : setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
471 : setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
472 : setOperationAction(ISD::FROUND, MVT::f16, Custom);
473 :
474 : // F16 - VOP2 Actions.
475 : setOperationAction(ISD::BR_CC, MVT::f16, Expand);
476 : setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
477 : setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
478 : setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
479 : setOperationAction(ISD::FDIV, MVT::f16, Custom);
480 :
481 : // F16 - VOP3 Actions.
482 : setOperationAction(ISD::FMA, MVT::f16, Legal);
483 1240 : if (!Subtarget->hasFP16Denormals())
484 : setOperationAction(ISD::FMAD, MVT::f16, Legal);
485 :
486 6200 : for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
487 1294560 : for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
488 1289600 : switch (Op) {
489 : case ISD::LOAD:
490 : case ISD::STORE:
491 : case ISD::BUILD_VECTOR:
492 : case ISD::BITCAST:
493 : case ISD::EXTRACT_VECTOR_ELT:
494 : case ISD::INSERT_VECTOR_ELT:
495 : case ISD::INSERT_SUBVECTOR:
496 : case ISD::EXTRACT_SUBVECTOR:
497 : case ISD::SCALAR_TO_VECTOR:
498 : break;
499 4960 : case ISD::CONCAT_VECTORS:
500 : setOperationAction(Op, VT, Custom);
501 4960 : break;
502 1240000 : default:
503 : setOperationAction(Op, VT, Expand);
504 1240000 : break;
505 : }
506 : }
507 : }
508 :
509 : // XXX - Do these do anything? Vector constants turn into build_vector.
510 : setOperationAction(ISD::Constant, MVT::v2i16, Legal);
511 : setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
512 :
513 : setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
514 : setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
515 :
516 : setOperationAction(ISD::STORE, MVT::v2i16, Promote);
517 : AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
518 : setOperationAction(ISD::STORE, MVT::v2f16, Promote);
519 : AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
520 :
521 : setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
522 : AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
523 : setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
524 : AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
525 :
526 : setOperationAction(ISD::AND, MVT::v2i16, Promote);
527 : AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
528 : setOperationAction(ISD::OR, MVT::v2i16, Promote);
529 : AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
530 : setOperationAction(ISD::XOR, MVT::v2i16, Promote);
531 : AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
532 :
533 : setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
534 : AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
535 : setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
536 : AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
537 :
538 : setOperationAction(ISD::STORE, MVT::v4i16, Promote);
539 : AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
540 : setOperationAction(ISD::STORE, MVT::v4f16, Promote);
541 : AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
542 :
543 : setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
544 : setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
545 : setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
546 : setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
547 :
548 : setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
549 : setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
550 : setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
551 :
552 1240 : if (!Subtarget->hasVOP3PInsts()) {
553 : setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
554 : setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
555 : }
556 :
557 : setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
558 : // This isn't really legal, but this avoids the legalizer unrolling it (and
559 : // allows matching fneg (fabs x) patterns)
560 : setOperationAction(ISD::FABS, MVT::v2f16, Legal);
561 : }
562 :
563 2492 : if (Subtarget->hasVOP3PInsts()) {
564 : setOperationAction(ISD::ADD, MVT::v2i16, Legal);
565 : setOperationAction(ISD::SUB, MVT::v2i16, Legal);
566 : setOperationAction(ISD::MUL, MVT::v2i16, Legal);
567 : setOperationAction(ISD::SHL, MVT::v2i16, Legal);
568 : setOperationAction(ISD::SRL, MVT::v2i16, Legal);
569 : setOperationAction(ISD::SRA, MVT::v2i16, Legal);
570 : setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
571 : setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
572 : setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
573 : setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
574 :
575 : setOperationAction(ISD::FADD, MVT::v2f16, Legal);
576 : setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
577 : setOperationAction(ISD::FMA, MVT::v2f16, Legal);
578 : setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
579 : setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
580 : setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
581 :
582 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
583 : setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
584 :
585 : setOperationAction(ISD::SHL, MVT::v4i16, Custom);
586 : setOperationAction(ISD::SRA, MVT::v4i16, Custom);
587 : setOperationAction(ISD::SRL, MVT::v4i16, Custom);
588 : setOperationAction(ISD::ADD, MVT::v4i16, Custom);
589 : setOperationAction(ISD::SUB, MVT::v4i16, Custom);
590 : setOperationAction(ISD::MUL, MVT::v4i16, Custom);
591 :
592 : setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
593 : setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
594 : setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
595 : setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
596 :
597 : setOperationAction(ISD::FADD, MVT::v4f16, Custom);
598 : setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
599 : setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
600 : setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
601 : setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
602 :
603 : setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
604 : setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
605 : setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
606 : }
607 :
608 : setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
609 : setOperationAction(ISD::FABS, MVT::v4f16, Custom);
610 :
611 2492 : if (Subtarget->has16BitInsts()) {
612 : setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
613 : AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
614 : setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
615 : AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
616 : } else {
617 : // Legalization hack.
618 : setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
619 : setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
620 :
621 : setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
622 : setOperationAction(ISD::FABS, MVT::v2f16, Custom);
623 : }
624 :
625 14952 : for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
626 : setOperationAction(ISD::SELECT, VT, Custom);
627 : }
628 :
629 : setTargetDAGCombine(ISD::ADD);
630 : setTargetDAGCombine(ISD::ADDCARRY);
631 : setTargetDAGCombine(ISD::SUB);
632 : setTargetDAGCombine(ISD::SUBCARRY);
633 : setTargetDAGCombine(ISD::FADD);
634 : setTargetDAGCombine(ISD::FSUB);
635 : setTargetDAGCombine(ISD::FMINNUM);
636 : setTargetDAGCombine(ISD::FMAXNUM);
637 : setTargetDAGCombine(ISD::FMA);
638 : setTargetDAGCombine(ISD::SMIN);
639 : setTargetDAGCombine(ISD::SMAX);
640 : setTargetDAGCombine(ISD::UMIN);
641 : setTargetDAGCombine(ISD::UMAX);
642 : setTargetDAGCombine(ISD::SETCC);
643 : setTargetDAGCombine(ISD::AND);
644 : setTargetDAGCombine(ISD::OR);
645 : setTargetDAGCombine(ISD::XOR);
646 : setTargetDAGCombine(ISD::SINT_TO_FP);
647 : setTargetDAGCombine(ISD::UINT_TO_FP);
648 : setTargetDAGCombine(ISD::FCANONICALIZE);
649 : setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
650 : setTargetDAGCombine(ISD::ZERO_EXTEND);
651 : setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
652 : setTargetDAGCombine(ISD::BUILD_VECTOR);
653 :
654 : // All memory operations. Some folding on the pointer operand is done to help
655 : // match the constant offsets in the addressing modes.
656 : setTargetDAGCombine(ISD::LOAD);
657 : setTargetDAGCombine(ISD::STORE);
658 : setTargetDAGCombine(ISD::ATOMIC_LOAD);
659 : setTargetDAGCombine(ISD::ATOMIC_STORE);
660 : setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
661 : setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
662 : setTargetDAGCombine(ISD::ATOMIC_SWAP);
663 : setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
664 : setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
665 : setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
666 : setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
667 : setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
668 : setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
669 : setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
670 : setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
671 : setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
672 : setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
673 :
674 : setSchedulingPreference(Sched::RegPressure);
675 :
676 : // SI at least has hardware support for floating point exceptions, but no way
677 : // of using or handling them is implemented. They are also optional in OpenCL
678 : // (Section 7.3)
679 2492 : setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
680 2492 : }
681 :
682 1042597 : const GCNSubtarget *SITargetLowering::getSubtarget() const {
683 1042597 : return Subtarget;
684 : }
685 :
686 : //===----------------------------------------------------------------------===//
687 : // TargetLowering queries
688 : //===----------------------------------------------------------------------===//
689 :
690 : // v_mad_mix* support a conversion from f16 to f32.
691 : //
692 : // There is only one special case where this is OK to use when denormals
693 : // are enabled, and we do not currently handle it.
694 24 : bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
695 : EVT DestVT, EVT SrcVT) const {
696 24 : return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
697 2 : (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
698 46 : DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
699 11 : SrcVT.getScalarType() == MVT::f16;
700 : }
701 :
702 32 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
703 : // SI has some legal vector types, but no legal vector operations. Say no
704 : // shuffles are legal in order to prefer scalarizing some vector operations.
705 32 : return false;
706 : }
707 :
708 146769 : MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
709 : CallingConv::ID CC,
710 : EVT VT) const {
711 : // TODO: Consider splitting all arguments into 32-bit pieces.
712 175209 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
713 8786 : EVT ScalarVT = VT.getScalarType();
714 8786 : unsigned Size = ScalarVT.getSizeInBits();
715 8786 : if (Size == 32)
716 8356 : return ScalarVT.getSimpleVT();
717 :
718 1705 : if (Size == 64)
719 167 : return MVT::i32;
720 :
721 1538 : if (Size == 16 && Subtarget->has16BitInsts())
722 1987 : return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
723 : }
724 :
725 138413 : return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
726 : }
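// Illustrative consequences of the mapping above, assuming a non-kernel
// calling convention on a subtarget with 16-bit instructions: a <2 x float>
// argument stays as two f32 registers, a <2 x i64> argument is split into i32
// pieces, and a <4 x half> argument is packed into v2f16 registers (the
// matching register counts come from getNumRegistersForCallingConv below).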
727 :
728 146769 : unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
729 : CallingConv::ID CC,
730 : EVT VT) const {
731 175209 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
732 : unsigned NumElts = VT.getVectorNumElements();
733 8786 : EVT ScalarVT = VT.getScalarType();
734 8786 : unsigned Size = ScalarVT.getSizeInBits();
735 :
736 8786 : if (Size == 32)
737 8356 : return NumElts;
738 :
739 1705 : if (Size == 64)
740 167 : return 2 * NumElts;
741 :
742 1538 : if (Size == 16 && Subtarget->has16BitInsts())
743 1108 : return (VT.getVectorNumElements() + 1) / 2;
744 : }
745 :
746 138413 : return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
747 : }
748 :
749 3407 : unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
750 : LLVMContext &Context, CallingConv::ID CC,
751 : EVT VT, EVT &IntermediateVT,
752 : unsigned &NumIntermediates, MVT &RegisterVT) const {
753 6813 : if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
754 : unsigned NumElts = VT.getVectorNumElements();
755 3406 : EVT ScalarVT = VT.getScalarType();
756 3406 : unsigned Size = ScalarVT.getSizeInBits();
757 3406 : if (Size == 32) {
758 2851 : RegisterVT = ScalarVT.getSimpleVT();
759 2851 : IntermediateVT = RegisterVT;
760 2851 : NumIntermediates = NumElts;
761 3187 : return NumIntermediates;
762 : }
763 :
764 555 : if (Size == 64) {
765 94 : RegisterVT = MVT::i32;
766 94 : IntermediateVT = RegisterVT;
767 94 : NumIntermediates = 2 * NumElts;
768 94 : return NumIntermediates;
769 : }
770 :
771 : // FIXME: We should fix the ABI to be the same on targets without 16-bit
772 : // support, but unless we can properly handle 3-vectors, it will be still be
773 : // inconsistent.
774 461 : if (Size == 16 && Subtarget->has16BitInsts()) {
775 242 : RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
776 242 : IntermediateVT = RegisterVT;
777 242 : NumIntermediates = (NumElts + 1) / 2;
778 242 : return NumIntermediates;
779 : }
780 : }
781 :
782 220 : return TargetLowering::getVectorTypeBreakdownForCallingConv(
783 220 : Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
784 : }
785 :
786 26154 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
787 : const CallInst &CI,
788 : MachineFunction &MF,
789 : unsigned IntrID) const {
790 26154 : if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
791 26154 : AMDGPU::lookupRsrcIntrinsic(IntrID)) {
792 : AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
793 1918 : (Intrinsic::ID)IntrID);
794 1918 : if (Attr.hasFnAttribute(Attribute::ReadNone))
795 : return false;
796 :
797 1822 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
798 :
799 1822 : if (RsrcIntr->IsImage) {
800 709 : Info.ptrVal = MFI->getImagePSV(
801 709 : *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
802 709 : CI.getArgOperand(RsrcIntr->RsrcArg));
803 709 : Info.align = 0;
804 : } else {
805 1113 : Info.ptrVal = MFI->getBufferPSV(
806 1113 : *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
807 1113 : CI.getArgOperand(RsrcIntr->RsrcArg));
808 : }
809 :
810 1822 : Info.flags = MachineMemOperand::MODereferenceable;
811 1822 : if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
812 1071 : Info.opc = ISD::INTRINSIC_W_CHAIN;
813 1071 : Info.memVT = MVT::getVT(CI.getType());
814 : Info.flags |= MachineMemOperand::MOLoad;
815 751 : } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
816 501 : Info.opc = ISD::INTRINSIC_VOID;
817 501 : Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
818 : Info.flags |= MachineMemOperand::MOStore;
819 : } else {
820 : // Atomic
821 250 : Info.opc = ISD::INTRINSIC_W_CHAIN;
822 250 : Info.memVT = MVT::getVT(CI.getType());
823 : Info.flags = MachineMemOperand::MOLoad |
824 : MachineMemOperand::MOStore |
825 : MachineMemOperand::MODereferenceable;
826 :
827 : // XXX - Should this be volatile without known ordering?
828 : Info.flags |= MachineMemOperand::MOVolatile;
829 : }
830 1822 : return true;
831 : }
832 :
833 : switch (IntrID) {
834 245 : case Intrinsic::amdgcn_atomic_inc:
835 : case Intrinsic::amdgcn_atomic_dec:
836 : case Intrinsic::amdgcn_ds_fadd:
837 : case Intrinsic::amdgcn_ds_fmin:
838 : case Intrinsic::amdgcn_ds_fmax: {
839 245 : Info.opc = ISD::INTRINSIC_W_CHAIN;
840 245 : Info.memVT = MVT::getVT(CI.getType());
841 245 : Info.ptrVal = CI.getOperand(0);
842 245 : Info.align = 0;
843 245 : Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
844 :
845 : const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
846 242 : if (!Vol || !Vol->isZero())
847 : Info.flags |= MachineMemOperand::MOVolatile;
848 :
849 : return true;
850 : }
851 :
852 : default:
853 : return false;
854 : }
855 : }
856 :
857 31707 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
858 : SmallVectorImpl<Value*> &Ops,
859 : Type *&AccessTy) const {
860 : switch (II->getIntrinsicID()) {
861 269 : case Intrinsic::amdgcn_atomic_inc:
862 : case Intrinsic::amdgcn_atomic_dec:
863 : case Intrinsic::amdgcn_ds_fadd:
864 : case Intrinsic::amdgcn_ds_fmin:
865 : case Intrinsic::amdgcn_ds_fmax: {
866 269 : Value *Ptr = II->getArgOperand(0);
867 269 : AccessTy = II->getType();
868 269 : Ops.push_back(Ptr);
869 : return true;
870 : }
871 : default:
872 : return false;
873 : }
874 : }
875 :
876 54857 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
877 54857 : if (!Subtarget->hasFlatInstOffsets()) {
878 : // Flat instructions do not have offsets, and only have the register
879 : // address.
880 84784 : return AM.BaseOffs == 0 && AM.Scale == 0;
881 : }
882 :
883 : // GFX9 added a 13-bit signed offset. When using regular flat instructions,
884 : // the sign bit is ignored and is treated as a 12-bit unsigned offset.
885 :
886 : // Just r + i
887 2704 : return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
888 : }
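// Sketch of what the check above accepts (illustrative values): without flat
// instruction offsets only a bare register address (BaseOffs == 0, Scale == 0)
// is legal; with them, "ptr + 4095" fits the 12-bit unsigned offset while
// "ptr + 4096" or any scaled index (Scale != 0) does not.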
889 :
890 111122 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
891 111122 : if (Subtarget->hasFlatGlobalInsts())
892 41446 : return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
893 :
894 90399 : if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
895 : // Assume that we will use FLAT for all global memory accesses
896 : // on VI.
897 : // FIXME: This assumption is currently wrong. On VI we still use
898 : // MUBUF instructions for the r + i addressing mode. As currently
899 : // implemented, the MUBUF instructions only work on buffer < 4GB.
900 : // It may be possible to support > 4GB buffers with MUBUF instructions,
901 : // by setting the stride value in the resource descriptor which would
902 : // increase the size limit to (stride * 4GB). However, this is risky,
903 : // because it has never been validated.
904 43763 : return isLegalFlatAddressingMode(AM);
905 : }
906 :
907 46636 : return isLegalMUBUFAddressingMode(AM);
908 : }
909 :
910 52973 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
911 : // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
912 : // additionally can do r + r + i with addr64. 32-bit has more addressing
913 : // mode options. Depending on the resource constant, it can also do
914 : // (i64 r0) + (i32 r1) * (i14 i).
915 : //
916 : // Private arrays end up using a scratch buffer most of the time, so also
917 : // assume those use MUBUF instructions. Scratch loads / stores are currently
918 : // implemented as mubuf instructions with offen bit set, so slightly
919 : // different than the normal addr64.
920 52973 : if (!isUInt<12>(AM.BaseOffs))
921 : return false;
922 :
923 : // FIXME: Since we can split immediate into soffset and immediate offset,
924 : // would it make sense to allow any immediate?
925 :
926 52452 : switch (AM.Scale) {
927 : case 0: // r + i or just i, depending on HasBaseReg.
928 : return true;
929 : case 1:
930 : return true; // We have r + r or r + i.
931 855 : case 2:
932 855 : if (AM.HasBaseReg) {
933 : // Reject 2 * r + r.
934 855 : return false;
935 : }
936 :
937 : // Allow 2 * r as r + r
938 : // Or 2 * r + i is allowed as r + r + i.
939 : return true;
940 13051 : default: // Don't allow n * r
941 13051 : return false;
942 : }
943 : }
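// Illustrative examples for the scale handling above: "r + 4094" and
// "r0 + r1 + 8" are accepted, a lone "2 * r" is accepted because it can be
// rewritten as "r + r", but "2 * r0 + r1" and any larger register multiple
// are rejected.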
944 :
945 221586 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
946 : const AddrMode &AM, Type *Ty,
947 : unsigned AS, Instruction *I) const {
948 : // No global is ever allowed as a base.
949 221586 : if (AM.BaseGV)
950 : return false;
951 :
952 218800 : if (AS == AMDGPUAS::GLOBAL_ADDRESS)
953 82368 : return isLegalGlobalAddressingMode(AM);
954 :
955 272864 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
956 136432 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
957 : // If the offset isn't a multiple of 4, it probably isn't going to be
958 : // correctly aligned.
959 : // FIXME: Can we get the real alignment here?
960 97945 : if (AM.BaseOffs % 4 != 0)
961 99 : return isLegalMUBUFAddressingMode(AM);
962 :
963 : // There are no SMRD extloads, so if we have to do a small type access we
964 : // will use a MUBUF load.
965 : // FIXME?: We also need to do this if unaligned, but we don't know the
966 : // alignment here.
967 195692 : if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
968 28754 : return isLegalGlobalAddressingMode(AM);
969 :
970 69092 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
971 : // SMRD instructions have an 8-bit, dword offset on SI.
972 19754 : if (!isUInt<8>(AM.BaseOffs / 4))
973 : return false;
974 49338 : } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
975 : // On CI+, this can also be a 32-bit literal constant offset. If it fits
976 : // in 8-bits, it can use a smaller encoding.
977 9715 : if (!isUInt<32>(AM.BaseOffs / 4))
978 : return false;
979 39623 : } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
980 : // On VI, these use the SMEM format and the offset is 20-bit in bytes.
981 39623 : if (!isUInt<20>(AM.BaseOffs))
982 : return false;
983 : } else
984 0 : llvm_unreachable("unhandled generation");
985 :
986 68951 : if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
987 : return true;
988 :
989 393 : if (AM.Scale == 1 && AM.HasBaseReg)
990 : return true;
991 :
992 393 : return false;
993 :
994 38487 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
995 6238 : return isLegalMUBUFAddressingMode(AM);
996 32249 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
997 : AS == AMDGPUAS::REGION_ADDRESS) {
998 : // Basic, single offset DS instructions allow a 16-bit unsigned immediate
999 : // field.
1000 : // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1001 : // an 8-bit dword offset but we don't know the alignment here.
1002 21155 : if (!isUInt<16>(AM.BaseOffs))
1003 : return false;
1004 :
1005 19369 : if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1006 : return true;
1007 :
1008 3786 : if (AM.Scale == 1 && AM.HasBaseReg)
1009 : return true;
1010 :
1011 2426 : return false;
1012 11094 : } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1013 : AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1014 : // For an unknown address space, this usually means that this is for some
1015 : // reason being used for pure arithmetic, and not based on some addressing
1016 : // computation. We don't have instructions that compute pointers with any
1017 : // addressing modes, so treat them as having no offset like flat
1018 : // instructions.
1019 11094 : return isLegalFlatAddressingMode(AM);
1020 : } else {
1021 0 : llvm_unreachable("unhandled address space");
1022 : }
1023 : }
1024 :
1025 15492 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1026 : const SelectionDAG &DAG) const {
1027 15492 : if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1028 7251 : return (MemVT.getSizeInBits() <= 4 * 32);
1029 8241 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1030 3301 : unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1031 3301 : return (MemVT.getSizeInBits() <= MaxPrivateBits);
1032 4940 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1033 4940 : return (MemVT.getSizeInBits() <= 2 * 32);
1034 : }
1035 : return true;
1036 : }
1037 :
1038 130502 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1039 : unsigned AddrSpace,
1040 : unsigned Align,
1041 : bool *IsFast) const {
1042 130502 : if (IsFast)
1043 83883 : *IsFast = false;
1044 :
1045 : // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1046 : // which isn't a simple VT.
1047 : // Until MVT is extended to handle this, simply check for the size and
1048 : // rely on the condition below: allow accesses if the size is a multiple of 4.
1049 130502 : if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1050 : VT.getStoreSize() > 16)) {
1051 0 : return false;
1052 : }
1053 :
1054 130502 : if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1055 : AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1056 : // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1057 : // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1058 : // with adjacent offsets.
1059 8037 : bool AlignedBy4 = (Align % 4 == 0);
1060 8037 : if (IsFast)
1061 5971 : *IsFast = AlignedBy4;
1062 :
1063 8037 : return AlignedBy4;
1064 : }
1065 :
1066 : // FIXME: We have to be conservative here and assume that flat operations
1067 : // will access scratch. If we had access to the IR function, then we
1068 : // could determine if any private memory was used in the function.
1069 122465 : if (!Subtarget->hasUnalignedScratchAccess() &&
1070 244858 : (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1071 122429 : AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1072 863 : bool AlignedBy4 = Align >= 4;
1073 863 : if (IsFast)
1074 673 : *IsFast = AlignedBy4;
1075 :
1076 863 : return AlignedBy4;
1077 : }
1078 :
1079 121602 : if (Subtarget->hasUnalignedBufferAccess()) {
1080 : // If we have a uniform constant load, it still requires using a slow
1081 : // buffer instruction if unaligned.
1082 6651 : if (IsFast) {
1083 4382 : *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1084 4382 : AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1085 675 : (Align % 4 == 0) : true;
1086 : }
1087 :
1088 6651 : return true;
1089 : }
1090 :
1091 : // Smaller than dword value must be aligned.
1092 114951 : if (VT.bitsLT(MVT::i32))
1093 : return false;
1094 :
1095 : // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1096 : // byte-address are ignored, thus forcing Dword alignment.
1097 : // This applies to private, global, and constant memory.
1098 113161 : if (IsFast)
1099 71273 : *IsFast = true;
1100 :
1101 117755 : return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1102 : }
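// Illustrative outcomes of the checks above: an 8-byte LDS access with only
// 4-byte alignment is allowed and reported fast (ds_read2/write2_b32), a
// misaligned private or flat access is rejected unless the subtarget has
// unaligned scratch access, and an access wider than a dword (e.g. v2i32) is
// allowed whenever its alignment is a multiple of 4.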
1103 :
1104 124 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1105 : unsigned SrcAlign, bool IsMemset,
1106 : bool ZeroMemset,
1107 : bool MemcpyStrSrc,
1108 : MachineFunction &MF) const {
1109 : // FIXME: Should account for address space here.
1110 :
1111 : // The default fallback uses the private pointer size as a guess for a type to
1112 : // use. Make sure we switch these to 64-bit accesses.
1113 :
1114 124 : if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1115 94 : return MVT::v4i32;
1116 :
1117 30 : if (Size >= 8 && DstAlign >= 4)
1118 12 : return MVT::v2i32;
1119 :
1120 : // Use the default.
1121 18 : return MVT::Other;
1122 : }
1123 :
1124 : static bool isFlatGlobalAddrSpace(unsigned AS) {
1125 667 : return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1126 : AS == AMDGPUAS::FLAT_ADDRESS ||
1127 667 : AS == AMDGPUAS::CONSTANT_ADDRESS;
1128 : }
1129 :
1130 246 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1131 : unsigned DestAS) const {
1132 246 : return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1133 : }
1134 :
1135 4734 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1136 : const MemSDNode *MemNode = cast<MemSDNode>(N);
1137 4734 : const Value *Ptr = MemNode->getMemOperand()->getValue();
1138 : const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1139 9032 : return I && I->getMetadata("amdgpu.noclobber");
1140 : }
1141 :
1142 94 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1143 : unsigned DestAS) const {
1144 : // Flat -> private/local is a simple truncate.
1145 : // Flat -> global is a no-op.
1146 94 : if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1147 : return true;
1148 :
1149 30 : return isNoopAddrSpaceCast(SrcAS, DestAS);
1150 : }
1151 :
1152 0 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1153 : const MemSDNode *MemNode = cast<MemSDNode>(N);
1154 :
1155 0 : return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1156 : }
1157 :
1158 : TargetLoweringBase::LegalizeTypeAction
1159 206860 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
1160 206860 : if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1161 107180 : return TypeSplitVector;
1162 :
1163 99680 : return TargetLoweringBase::getPreferredVectorAction(VT);
1164 : }
1165 :
1166 32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1167 : Type *Ty) const {
1168 : // FIXME: Could be smarter if called for vector constants.
1169 32 : return true;
1170 : }
1171 :
1172 303233 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1173 303233 : if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1174 31162 : switch (Op) {
1175 : case ISD::LOAD:
1176 : case ISD::STORE:
1177 :
1178 : // These operations are done with 32-bit instructions anyway.
1179 : case ISD::AND:
1180 : case ISD::OR:
1181 : case ISD::XOR:
1182 : case ISD::SELECT:
1183 : // TODO: Extensions?
1184 : return true;
1185 26971 : default:
1186 26971 : return false;
1187 : }
1188 : }
1189 :
1190 : // SimplifySetCC uses this function to determine whether or not it should
1191 : // create setcc with i1 operands. We don't have instructions for i1 setcc.
1192 682 : if (VT == MVT::i1 && Op == ISD::SETCC)
1193 24 : return false;
1194 :
1195 272047 : return TargetLowering::isTypeDesirableForOp(Op, VT);
1196 : }
1197 :
1198 41155 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1199 : const SDLoc &SL,
1200 : SDValue Chain,
1201 : uint64_t Offset) const {
1202 41155 : const DataLayout &DL = DAG.getDataLayout();
1203 41155 : MachineFunction &MF = DAG.getMachineFunction();
1204 41155 : const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1205 :
1206 : const ArgDescriptor *InputPtrReg;
1207 : const TargetRegisterClass *RC;
1208 :
1209 : std::tie(InputPtrReg, RC)
1210 : = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1211 :
1212 41155 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1213 : MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1214 : SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1215 41155 : MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1216 :
1217 41155 : return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1218 : }
1219 :
1220 42 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1221 : const SDLoc &SL) const {
1222 42 : uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1223 42 : FIRST_IMPLICIT);
1224 42 : return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1225 : }
1226 :
1227 41113 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1228 : const SDLoc &SL, SDValue Val,
1229 : bool Signed,
1230 : const ISD::InputArg *Arg) const {
1231 41113 : if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1232 80 : VT.bitsLT(MemVT)) {
1233 0 : unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1234 0 : Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1235 : }
1236 :
1237 41113 : if (MemVT.isFloatingPoint())
1238 2526 : Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1239 38587 : else if (Signed)
1240 14 : Val = DAG.getSExtOrTrunc(Val, SL, VT);
1241 : else
1242 38573 : Val = DAG.getZExtOrTrunc(Val, SL, VT);
1243 :
1244 41113 : return Val;
1245 : }
1246 :
1247 41113 : SDValue SITargetLowering::lowerKernargMemParameter(
1248 : SelectionDAG &DAG, EVT VT, EVT MemVT,
1249 : const SDLoc &SL, SDValue Chain,
1250 : uint64_t Offset, unsigned Align, bool Signed,
1251 : const ISD::InputArg *Arg) const {
1252 41113 : Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1253 41113 : PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1254 41113 : MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1255 :
1256 : // Try to avoid using an extload by loading earlier than the argument address,
1257 : // and extracting the relevant bits. The load should hopefully be merged with
1258 : // the previous argument.
1259 41113 : if (MemVT.getStoreSize() < 4 && Align < 4) {
1260 : // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1261 : int64_t AlignDownOffset = alignDown(Offset, 4);
1262 1688 : int64_t OffsetDiff = Offset - AlignDownOffset;
1263 :
1264 1688 : EVT IntVT = MemVT.changeTypeToInteger();
1265 :
1266 : // TODO: If we passed in the base kernel offset we could have a better
1267 : // alignment than 4, but we don't really need it.
1268 1688 : SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1269 : SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1270 : MachineMemOperand::MODereferenceable |
1271 1688 : MachineMemOperand::MOInvariant);
1272 :
1273 1688 : SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1274 1688 : SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1275 :
1276 1688 : SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1277 1688 : ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1278 1688 : ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1279 :
1280 :
1281 3376 : return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1282 : }
1283 :
1284 39425 : SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1285 : SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1286 : MachineMemOperand::MODereferenceable |
1287 39425 : MachineMemOperand::MOInvariant);
1288 :
1289 39425 : SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1290 78850 : return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1291 : }
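// Worked example of the align-down path above (hypothetical argument layout):
// a 2-byte argument at Offset = 6 with Align = 2 gives AlignDownOffset = 4 and
// OffsetDiff = 2, so we emit a dword load at offset 4, shift right by
// OffsetDiff * 8 = 16, and truncate to 16 bits, which lets the load merge with
// the load for the argument at offset 4.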
1292 :
1293 402 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1294 : const SDLoc &SL, SDValue Chain,
1295 : const ISD::InputArg &Arg) const {
1296 402 : MachineFunction &MF = DAG.getMachineFunction();
1297 402 : MachineFrameInfo &MFI = MF.getFrameInfo();
1298 :
1299 402 : if (Arg.Flags.isByVal()) {
1300 71 : unsigned Size = Arg.Flags.getByValSize();
1301 71 : int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1302 71 : return DAG.getFrameIndex(FrameIdx, MVT::i32);
1303 : }
1304 :
1305 331 : unsigned ArgOffset = VA.getLocMemOffset();
1306 331 : unsigned ArgSize = VA.getValVT().getStoreSize();
1307 :
1308 331 : int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1309 :
1310 : // Create load nodes to retrieve arguments from the stack.
1311 331 : SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1312 : SDValue ArgValue;
1313 :
1314 : // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1315 : ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1316 : MVT MemVT = VA.getValVT();
1317 :
1318 331 : switch (VA.getLocInfo()) {
1319 : default:
1320 : break;
1321 0 : case CCValAssign::BCvt:
1322 : MemVT = VA.getLocVT();
1323 0 : break;
1324 0 : case CCValAssign::SExt:
1325 : ExtType = ISD::SEXTLOAD;
1326 0 : break;
1327 0 : case CCValAssign::ZExt:
1328 : ExtType = ISD::ZEXTLOAD;
1329 0 : break;
1330 3 : case CCValAssign::AExt:
1331 : ExtType = ISD::EXTLOAD;
1332 3 : break;
1333 : }
1334 :
1335 331 : ArgValue = DAG.getExtLoad(
1336 : ExtType, SL, VA.getLocVT(), Chain, FIN,
1337 : MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1338 331 : MemVT);
1339 331 : return ArgValue;
1340 : }
1341 :
1342 13733 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1343 : const SIMachineFunctionInfo &MFI,
1344 : EVT VT,
1345 : AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1346 : const ArgDescriptor *Reg;
1347 : const TargetRegisterClass *RC;
1348 :
1349 : std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1350 13733 : return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1351 : }
1352 :
1353 0 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1354 : CallingConv::ID CallConv,
1355 : ArrayRef<ISD::InputArg> Ins,
1356 : BitVector &Skipped,
1357 : FunctionType *FType,
1358 : SIMachineFunctionInfo *Info) {
1359 0 : for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1360 0 : const ISD::InputArg *Arg = &Ins[I];
1361 :
1362 : assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1363 : "vector type argument should have been split");
1364 :
1365 : // First check if it's a PS input addr.
1366 0 : if (CallConv == CallingConv::AMDGPU_PS &&
1367 0 : !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1368 :
1369 0 : bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1370 :
1371 : // Inconveniently only the first part of the split is marked as isSplit,
1372 : // so skip to the end. We only want to increment PSInputNum once for the
1373 : // entire split argument.
1374 0 : if (Arg->Flags.isSplit()) {
1375 0 : while (!Arg->Flags.isSplitEnd()) {
1376 : assert(!Arg->VT.isVector() &&
1377 : "unexpected vector split in ps argument type");
1378 0 : if (!SkipArg)
1379 0 : Splits.push_back(*Arg);
1380 0 : Arg = &Ins[++I];
1381 : }
1382 : }
1383 :
1384 0 : if (SkipArg) {
1385 : // We can safely skip PS inputs.
1386 0 : Skipped.set(Arg->getOrigArgIndex());
1387 0 : ++PSInputNum;
1388 0 : continue;
1389 : }
1390 :
1391 : Info->markPSInputAllocated(PSInputNum);
1392 0 : if (Arg->Used)
1393 : Info->markPSInputEnabled(PSInputNum);
1394 :
1395 0 : ++PSInputNum;
1396 : }
1397 :
1398 0 : Splits.push_back(*Arg);
1399 : }
1400 0 : }
1401 :
1402 : // Allocate special inputs passed in VGPRs.
1403 0 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1404 : MachineFunction &MF,
1405 : const SIRegisterInfo &TRI,
1406 : SIMachineFunctionInfo &Info) {
1407 0 : if (Info.hasWorkItemIDX()) {
1408 : unsigned Reg = AMDGPU::VGPR0;
1409 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1410 :
1411 0 : CCInfo.AllocateReg(Reg);
1412 0 : Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1413 : }
1414 :
1415 0 : if (Info.hasWorkItemIDY()) {
1416 : unsigned Reg = AMDGPU::VGPR1;
1417 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1418 :
1419 0 : CCInfo.AllocateReg(Reg);
1420 0 : Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1421 : }
1422 :
1423 0 : if (Info.hasWorkItemIDZ()) {
1424 : unsigned Reg = AMDGPU::VGPR2;
1425 0 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1426 :
1427 0 : CCInfo.AllocateReg(Reg);
1428 0 : Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1429 : }
1430 0 : }
1431 :
1432 : // Try to allocate a VGPR at the end of the argument list, or if no argument
1433 : // VGPRs are left, allocate a stack slot.
1434 38 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1435 : ArrayRef<MCPhysReg> ArgVGPRs
1436 38 : = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1437 : unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1438 38 : if (RegIdx == ArgVGPRs.size()) {
1439 : // Spill to stack required.
1440 8 : int64_t Offset = CCInfo.AllocateStack(4, 4);
1441 :
1442 : return ArgDescriptor::createStack(Offset);
1443 : }
1444 :
1445 30 : unsigned Reg = ArgVGPRs[RegIdx];
1446 30 : Reg = CCInfo.AllocateReg(Reg);
1447 : assert(Reg != AMDGPU::NoRegister);
1448 :
1449 30 : MachineFunction &MF = CCInfo.getMachineFunction();
1450 30 : MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1451 : return ArgDescriptor::createRegister(Reg);
1452 : }
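     : // In short: the first 32 VGPRs form the argument-register pool; once they are
     : // exhausted, the work-item ID input falls back to a 4-byte stack slot, and
     : // otherwise the chosen VGPR is marked live-in and returned as a register
     : // descriptor.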
1453 :
1454 0 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1455 : const TargetRegisterClass *RC,
1456 : unsigned NumArgRegs) {
1457 0 : ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1458 : unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1459 0 : if (RegIdx == ArgSGPRs.size())
1460 0 : report_fatal_error("ran out of SGPRs for arguments");
1461 :
1462 0 : unsigned Reg = ArgSGPRs[RegIdx];
1463 0 : Reg = CCInfo.AllocateReg(Reg);
1464 : assert(Reg != AMDGPU::NoRegister);
1465 :
1466 0 : MachineFunction &MF = CCInfo.getMachineFunction();
1467 0 : MF.addLiveIn(Reg, RC);
1468 0 : return ArgDescriptor::createRegister(Reg);
1469 : }
1470 :
1471 : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1472 0 : return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1473 : }
1474 :
1475 : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1476 0 : return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1477 : }
1478 :
1479 0 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
1480 : MachineFunction &MF,
1481 : const SIRegisterInfo &TRI,
1482 : SIMachineFunctionInfo &Info) {
1483 0 : if (Info.hasWorkItemIDX())
1484 0 : Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1485 :
1486 0 : if (Info.hasWorkItemIDY())
1487 0 : Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1488 :
1489 0 : if (Info.hasWorkItemIDZ())
1490 0 : Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1491 0 : }
1492 :
1493 0 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
1494 : MachineFunction &MF,
1495 : const SIRegisterInfo &TRI,
1496 : SIMachineFunctionInfo &Info) {
1497 : auto &ArgInfo = Info.getArgInfo();
1498 :
1499 : // TODO: Unify handling with private memory pointers.
1500 :
1501 0 : if (Info.hasDispatchPtr())
1502 0 : ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1503 :
1504 0 : if (Info.hasQueuePtr())
1505 0 : ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1506 :
1507 0 : if (Info.hasKernargSegmentPtr())
1508 0 : ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1509 :
1510 0 : if (Info.hasDispatchID())
1511 0 : ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1512 :
1513 : // flat_scratch_init is not applicable for non-kernel functions.
1514 :
1515 0 : if (Info.hasWorkGroupIDX())
1516 0 : ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1517 :
1518 0 : if (Info.hasWorkGroupIDY())
1519 0 : ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1520 :
1521 0 : if (Info.hasWorkGroupIDZ())
1522 0 : ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1523 :
1524 0 : if (Info.hasImplicitArgPtr())
1525 0 : ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1526 0 : }
1527 :
1528 : // Allocate special inputs passed in user SGPRs.
1529 17950 : static void allocateHSAUserSGPRs(CCState &CCInfo,
1530 : MachineFunction &MF,
1531 : const SIRegisterInfo &TRI,
1532 : SIMachineFunctionInfo &Info) {
1533 17950 : if (Info.hasImplicitBufferPtr()) {
1534 2 : unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1535 2 : MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1536 2 : CCInfo.AllocateReg(ImplicitBufferPtrReg);
1537 : }
1538 :
1539 : // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1540 17950 : if (Info.hasPrivateSegmentBuffer()) {
1541 2564 : unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1542 2564 : MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1543 2564 : CCInfo.AllocateReg(PrivateSegmentBufferReg);
1544 : }
1545 :
1546 17950 : if (Info.hasDispatchPtr()) {
1547 42 : unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1548 42 : MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1549 42 : CCInfo.AllocateReg(DispatchPtrReg);
1550 : }
1551 :
1552 17950 : if (Info.hasQueuePtr()) {
1553 57 : unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1554 57 : MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1555 57 : CCInfo.AllocateReg(QueuePtrReg);
1556 : }
1557 :
1558 17950 : if (Info.hasKernargSegmentPtr()) {
1559 15176 : unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1560 15176 : MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1561 15176 : CCInfo.AllocateReg(InputPtrReg);
1562 : }
1563 :
1564 17950 : if (Info.hasDispatchID()) {
1565 5 : unsigned DispatchIDReg = Info.addDispatchID(TRI);
1566 5 : MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1567 5 : CCInfo.AllocateReg(DispatchIDReg);
1568 : }
1569 :
1570 17950 : if (Info.hasFlatScratchInit()) {
1571 381 : unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1572 381 : MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1573 381 : CCInfo.AllocateReg(FlatScratchInitReg);
1574 : }
1575 :
1576 : // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1577 : // these from the dispatch pointer.
1578 17950 : }
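     : // The order of the checks above matters: each Info.add*() call hands out the
     : // next user SGPRs in sequence, so this ordering is what fixes which physical
     : // SGPRs the implicit buffer pointer, dispatch pointer, queue pointer, kernarg
     : // segment pointer, dispatch ID and flat scratch init end up in.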
1579 :
1580 : // Allocate special input registers that are initialized per-wave.
1581 17950 : static void allocateSystemSGPRs(CCState &CCInfo,
1582 : MachineFunction &MF,
1583 : SIMachineFunctionInfo &Info,
1584 : CallingConv::ID CallConv,
1585 : bool IsShader) {
1586 17950 : if (Info.hasWorkGroupIDX()) {
1587 : unsigned Reg = Info.addWorkGroupIDX();
1588 16213 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1589 16213 : CCInfo.AllocateReg(Reg);
1590 : }
1591 :
1592 17950 : if (Info.hasWorkGroupIDY()) {
1593 : unsigned Reg = Info.addWorkGroupIDY();
1594 24 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1595 24 : CCInfo.AllocateReg(Reg);
1596 : }
1597 :
1598 17950 : if (Info.hasWorkGroupIDZ()) {
1599 : unsigned Reg = Info.addWorkGroupIDZ();
1600 24 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1601 24 : CCInfo.AllocateReg(Reg);
1602 : }
1603 :
1604 17950 : if (Info.hasWorkGroupInfo()) {
1605 : unsigned Reg = Info.addWorkGroupInfo();
1606 0 : MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1607 0 : CCInfo.AllocateReg(Reg);
1608 : }
1609 :
1610 17950 : if (Info.hasPrivateSegmentWaveByteOffset()) {
1611 : // Scratch wave offset passed in system SGPR.
1612 : unsigned PrivateSegmentWaveByteOffsetReg;
1613 :
1614 16263 : if (IsShader) {
1615 : PrivateSegmentWaveByteOffsetReg =
1616 : Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1617 :
1618 : // This is true if the scratch wave byte offset doesn't have a fixed
1619 : // location.
1620 50 : if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1621 : PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1622 : Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1623 : }
1624 : } else
1625 : PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1626 :
1627 16263 : MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1628 16263 : CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1629 : }
1630 17950 : }
1631 :
1632 17974 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
1633 : MachineFunction &MF,
1634 : const SIRegisterInfo &TRI,
1635 : SIMachineFunctionInfo &Info) {
1636 : // Now that we've figured out where the scratch register inputs are, see if
1637 : // we should reserve the arguments and use them directly.
1638 17974 : MachineFrameInfo &MFI = MF.getFrameInfo();
1639 : bool HasStackObjects = MFI.hasStackObjects();
1640 :
1641 : // Record that we know we have non-spill stack objects so we don't need to
1642 : // check all stack objects later.
1643 17974 : if (HasStackObjects)
1644 : Info.setHasNonSpillStackObjects(true);
1645 :
1646 : // Everything live out of a block is spilled with fast regalloc, so it's
1647 : // almost certain that spilling will be required.
1648 17974 : if (TM.getOptLevel() == CodeGenOpt::None)
1649 : HasStackObjects = true;
1650 :
1651 : // For now assume stack access is needed in any callee functions, so we need
1652 : // the scratch registers to pass in.
1653 17790 : bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1654 :
1655 17974 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1656 17974 : if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1657 2567 : if (RequiresStackAccess) {
1658 : // If we have stack objects, we unquestionably need the private buffer
1659 : // resource. For the Code Object V2 ABI, this will be the first 4 user
1660 : // SGPR inputs. We can reserve those and use them directly.
1661 :
1662 : unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1663 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1664 : Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1665 :
1666 484 : if (MFI.hasCalls()) {
1667 : // If we have calls, we need to keep the frame register in a register
1668 : // that won't be clobbered by a call, so ensure it is copied somewhere.
1669 :
1670 : // This is not a problem for the scratch wave offset, because the same
1671 : // registers are reserved in all functions.
1672 :
1673 : // FIXME: Nothing is really ensuring this is a call preserved register,
1674 : // it's just selected from the end so it happens to be.
1675 : unsigned ReservedOffsetReg
1676 265 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1677 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1678 : } else {
1679 : unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1680 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1681 : Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1682 : }
1683 : } else {
1684 : unsigned ReservedBufferReg
1685 2083 : = TRI.reservedPrivateSegmentBufferReg(MF);
1686 : unsigned ReservedOffsetReg
1687 2083 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1688 :
1689 : // We tentatively reserve the last registers (skipping the last two
1690 : // which may contain VCC). After register allocation, we'll replace
1691 : // these with the ones immediately after those which were really
1692 : // allocated. In the prologue, copies will be inserted from the argument
1693 : // to these reserved registers.
1694 : Info.setScratchRSrcReg(ReservedBufferReg);
1695 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1696 : }
1697 : } else {
1698 15407 : unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1699 :
1700 : // Without HSA, relocations are used for the scratch pointer and the
1701 : // buffer resource setup is always inserted in the prologue. Scratch wave
1702 : // offset is still in an input SGPR.
1703 : Info.setScratchRSrcReg(ReservedBufferReg);
1704 :
1705 15407 : if (HasStackObjects && !MFI.hasCalls()) {
1706 : unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1707 : AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1708 : Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1709 : } else {
1710 : unsigned ReservedOffsetReg
1711 15081 : = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1712 : Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1713 : }
1714 : }
1715 17974 : }
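     : // Summary of the reservation logic above: any entry function that may touch
     : // scratch needs both a buffer resource descriptor and a wave byte offset.
     : // When the preloaded inputs exist they are reused directly; otherwise
     : // registers near the end of the SGPR file are tentatively reserved and fixed
     : // up after register allocation.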
1716 :
1717 19524 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1718 19524 : const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1719 19524 : return !Info->isEntryFunction();
1720 : }
1721 :
1722 1755 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1723 :
1724 1755 : }
1725 :
1726 1754 : void SITargetLowering::insertCopiesSplitCSR(
1727 : MachineBasicBlock *Entry,
1728 : const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1729 1754 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1730 :
1731 1754 : const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1732 1754 : if (!IStart)
1733 1754 : return;
1734 :
1735 0 : const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1736 0 : MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1737 0 : MachineBasicBlock::iterator MBBI = Entry->begin();
1738 0 : for (const MCPhysReg *I = IStart; *I; ++I) {
1739 : const TargetRegisterClass *RC = nullptr;
1740 0 : if (AMDGPU::SReg_64RegClass.contains(*I))
1741 : RC = &AMDGPU::SGPR_64RegClass;
1742 0 : else if (AMDGPU::SReg_32RegClass.contains(*I))
1743 : RC = &AMDGPU::SGPR_32RegClass;
1744 : else
1745 0 : llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1746 :
1747 0 : unsigned NewVR = MRI->createVirtualRegister(RC);
1748 : // Create copy from CSR to a virtual register.
1749 0 : Entry->addLiveIn(*I);
1750 0 : BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1751 0 : .addReg(*I);
1752 :
1753 : // Insert the copy-back instructions right before the terminator.
1754 0 : for (auto *Exit : Exits)
1755 0 : BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1756 0 : TII->get(TargetOpcode::COPY), *I)
1757 0 : .addReg(NewVR);
1758 : }
1759 : }
1760 :
1761 19712 : SDValue SITargetLowering::LowerFormalArguments(
1762 : SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1763 : const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1764 : SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1765 19712 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1766 :
1767 19712 : MachineFunction &MF = DAG.getMachineFunction();
1768 19712 : const Function &Fn = MF.getFunction();
1769 : FunctionType *FType = MF.getFunction().getFunctionType();
1770 19712 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1771 19712 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1772 :
1773 39424 : if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1774 : DiagnosticInfoUnsupported NoGraphicsHSA(
1775 3 : Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1776 3 : DAG.getContext()->diagnose(NoGraphicsHSA);
1777 : return DAG.getEntryNode();
1778 : }
1779 :
1780 : // Create stack objects that are used for emitting the debugger prologue if
1781 : // the "amdgpu-debugger-emit-prologue" attribute was specified.
1782 19709 : if (ST.debuggerEmitPrologue())
1783 4 : createDebuggerPrologueStackObjects(MF);
1784 :
1785 : SmallVector<ISD::InputArg, 16> Splits;
1786 : SmallVector<CCValAssign, 16> ArgLocs;
1787 19709 : BitVector Skipped(Ins.size());
1788 : CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1789 39418 : *DAG.getContext());
1790 :
1791 19709 : bool IsShader = AMDGPU::isShader(CallConv);
1792 : bool IsKernel = AMDGPU::isKernel(CallConv);
1793 19709 : bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1794 :
1795 19709 : if (!IsEntryFunc) {
1796 : // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1797 : // this when allocating argument fixed offsets.
1798 1759 : CCInfo.AllocateStack(4, 4);
1799 : }
1800 :
1801 19709 : if (IsShader) {
1802 1737 : processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1803 :
1804 : // At least one interpolation mode must be enabled or else the GPU will
1805 : // hang.
1806 : //
1807 : // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1808 : // set PSInputAddr, the user wants to enable some bits after the compilation
1809 : // based on run-time states. Since we can't know what the final PSInputEna
1810 : // will look like, we shouldn't do anything here, and the user should take
1811 : // responsibility for the correct programming.
1812 : //
1813 : // Otherwise, the following restrictions apply:
1814 : // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1815 : // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1816 : // enabled too.
1817 1737 : if (CallConv == CallingConv::AMDGPU_PS) {
1818 1433 : if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1819 1136 : ((Info->getPSInputAddr() & 0xF) == 0 &&
1820 : Info->isPSInputAllocated(11))) {
1821 : CCInfo.AllocateReg(AMDGPU::VGPR0);
1822 : CCInfo.AllocateReg(AMDGPU::VGPR1);
1823 : Info->markPSInputAllocated(0);
1824 : Info->markPSInputEnabled(0);
1825 : }
1826 2866 : if (Subtarget->isAmdPalOS()) {
1827 : // For isAmdPalOS, the user does not enable some bits after compilation
1828 : // based on run-time states; the register values being generated here are
1829 : // the final ones set in hardware. Therefore we need to apply the
1830 : // workaround to PSInputAddr and PSInputEnable together. (The case where
1831 : // a bit is set in PSInputAddr but not PSInputEnable is where the
1832 : // frontend set up an input arg for a particular interpolation mode, but
1833 : // nothing uses that input arg. Really we should have an earlier pass
1834 : // that removes such an arg.)
1835 17 : unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1836 17 : if ((PsInputBits & 0x7F) == 0 ||
1837 14 : ((PsInputBits & 0xF) == 0 &&
1838 : (PsInputBits >> 11 & 1)))
1839 3 : Info->markPSInputEnabled(
1840 : countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1841 : }
1842 : }
1843 :
1844 : assert(!Info->hasDispatchPtr() &&
1845 : !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1846 : !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1847 : !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1848 : !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1849 : !Info->hasWorkItemIDZ());
1850 17972 : } else if (IsKernel) {
1851 : assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1852 : } else {
1853 1759 : Splits.append(Ins.begin(), Ins.end());
1854 : }
1855 :
1856 19709 : if (IsEntryFunc) {
1857 17950 : allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1858 17950 : allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1859 : }
1860 :
1861 19709 : if (IsKernel) {
1862 16213 : analyzeFormalArgumentsCompute(CCInfo, Ins);
1863 : } else {
1864 3496 : CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1865 3496 : CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1866 : }
1867 :
1868 : SmallVector<SDValue, 16> Chains;
1869 :
1870 : // FIXME: This is the minimum kernel argument alignment. We should improve
1871 : // this to the maximum alignment of the arguments.
1872 : //
1873 : // FIXME: Alignment of explicit arguments is totally broken with a non-0 explicit
1874 : // kern arg offset.
1875 : const unsigned KernelArgBaseAlign = 16;
1876 :
1877 83072 : for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1878 63363 : const ISD::InputArg &Arg = Ins[i];
1879 63363 : if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1880 4412 : InVals.push_back(DAG.getUNDEF(Arg.VT));
1881 43618 : continue;
1882 : }
1883 :
1884 61157 : CCValAssign &VA = ArgLocs[ArgIdx++];
1885 : MVT VT = VA.getLocVT();
1886 :
1887 61157 : if (IsEntryFunc && VA.isMemLoc()) {
1888 41010 : VT = Ins[i].VT;
1889 : EVT MemVT = VA.getLocVT();
1890 :
1891 41010 : const uint64_t Offset = VA.getLocMemOffset();
1892 41010 : unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1893 :
1894 : SDValue Arg = lowerKernargMemParameter(
1895 82020 : DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1896 41010 : Chains.push_back(Arg.getValue(1));
1897 :
1898 : auto *ParamTy =
1899 41010 : dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1900 14128 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1901 49102 : ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1902 : // On SI local pointers are just offsets into LDS, so they are always
1903 : // less than 16-bits. On CI and newer they could potentially be
1904 : // real pointers, so we can't guarantee their size.
1905 644 : Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1906 644 : DAG.getValueType(MVT::i16));
1907 : }
1908 :
1909 41010 : InVals.push_back(Arg);
1910 : continue;
1911 20147 : } else if (!IsEntryFunc && VA.isMemLoc()) {
1912 402 : SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1913 402 : InVals.push_back(Val);
1914 402 : if (!Arg.Flags.isByVal())
1915 331 : Chains.push_back(Val.getValue(1));
1916 : continue;
1917 : }
1918 :
1919 : assert(VA.isRegLoc() && "Parameter must be in a register!");
1920 :
1921 19745 : unsigned Reg = VA.getLocReg();
1922 19745 : const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1923 : EVT ValVT = VA.getValVT();
1924 :
1925 19745 : Reg = MF.addLiveIn(Reg, RC);
1926 19745 : SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1927 :
1928 19745 : if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1929 : // The return object should be reasonably addressable.
1930 :
1931 : // FIXME: This helps when the return is a real sret. If it is a
1932 : // FIXME: This helps when the return is a real sret. If it is an
1933 : // extra copy is inserted in SelectionDAGBuilder which obscures this.
1934 12 : unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1935 12 : Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1936 12 : DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1937 : }
1938 :
1939 : // If this is an 8 or 16-bit value, it is really passed promoted
1940 : // to 32 bits. Insert an assert[sz]ext to capture this, then
1941 : // truncate to the right size.
1942 19745 : switch (VA.getLocInfo()) {
1943 : case CCValAssign::Full:
1944 : break;
1945 : case CCValAssign::BCvt:
1946 0 : Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1947 0 : break;
1948 : case CCValAssign::SExt:
1949 8 : Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1950 8 : DAG.getValueType(ValVT));
1951 8 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1952 8 : break;
1953 : case CCValAssign::ZExt:
1954 12 : Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1955 12 : DAG.getValueType(ValVT));
1956 12 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1957 12 : break;
1958 : case CCValAssign::AExt:
1959 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1960 7 : break;
1961 0 : default:
1962 0 : llvm_unreachable("Unknown loc info!");
1963 : }
1964 :
1965 19745 : InVals.push_back(Val);
1966 : }
1967 :
1968 19709 : if (!IsEntryFunc) {
1969 : // Special inputs come after user arguments.
1970 1759 : allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1971 : }
1972 :
1973 : // Start adding system SGPRs.
1974 19709 : if (IsEntryFunc) {
1975 17950 : allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1976 : } else {
1977 1759 : CCInfo.AllocateReg(Info->getScratchRSrcReg());
1978 1759 : CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1979 1759 : CCInfo.AllocateReg(Info->getFrameOffsetReg());
1980 1759 : allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1981 : }
1982 :
1983 : auto &ArgUsageInfo =
1984 19709 : DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1985 19709 : ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
1986 :
1987 19709 : unsigned StackArgSize = CCInfo.getNextStackOffset();
1988 : Info->setBytesInStackArgArea(StackArgSize);
1989 :
1990 19709 : return Chains.empty() ? Chain :
1991 19709 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1992 : }
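     : // Recap of the three argument paths handled above: kernel arguments become
     : // invariant loads from the kernarg segment, non-entry stack arguments become
     : // fixed-object loads, and register arguments are copied out of live-ins with
     : // an optional assert/truncate to recover the original narrow type.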
1993 :
1994 : // TODO: If return values can't fit in registers, we should return as many as
1995 : // possible in registers before passing on stack.
1996 20295 : bool SITargetLowering::CanLowerReturn(
1997 : CallingConv::ID CallConv,
1998 : MachineFunction &MF, bool IsVarArg,
1999 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2000 : LLVMContext &Context) const {
2001 : // Replacing returns with sret/stack usage doesn't make sense for shaders.
2002 : // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2003 : // for shaders. Vector types should be explicitly handled by CC.
2004 20295 : if (AMDGPU::isEntryFunctionCC(CallConv))
2005 : return true;
2006 :
2007 : SmallVector<CCValAssign, 16> RVLocs;
2008 4684 : CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2009 2342 : return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2010 : }
2011 :
2012 : SDValue
2013 19637 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2014 : bool isVarArg,
2015 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2016 : const SmallVectorImpl<SDValue> &OutVals,
2017 : const SDLoc &DL, SelectionDAG &DAG) const {
2018 19637 : MachineFunction &MF = DAG.getMachineFunction();
2019 19637 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2020 :
2021 : if (AMDGPU::isKernel(CallConv)) {
2022 : return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2023 16187 : OutVals, DL, DAG);
2024 : }
2025 :
2026 3450 : bool IsShader = AMDGPU::isShader(CallConv);
2027 :
2028 3450 : Info->setIfReturnsVoid(Outs.empty());
2029 3450 : bool IsWaveEnd = Info->returnsVoid() && IsShader;
2030 :
2031 : // CCValAssign - represent the assignment of the return value to a location.
2032 : SmallVector<CCValAssign, 48> RVLocs;
2033 : SmallVector<ISD::OutputArg, 48> Splits;
2034 :
2035 : // CCState - Info about the registers and stack slots.
2036 : CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2037 6900 : *DAG.getContext());
2038 :
2039 : // Analyze outgoing return values.
2040 3450 : CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2041 :
2042 3450 : SDValue Flag;
2043 : SmallVector<SDValue, 48> RetOps;
2044 3450 : RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2045 :
2046 : // Add return address for callable functions.
2047 3450 : if (!Info->isEntryFunction()) {
2048 1713 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2049 : SDValue ReturnAddrReg = CreateLiveInRegister(
2050 3426 : DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2051 :
2052 : // FIXME: Should be able to use a vreg here, but need a way to prevent it
2053 : // from being allcoated to a CSR.
2054 : // from being allocated to a CSR.
2055 : SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2056 1713 : MVT::i64);
2057 :
2058 1713 : Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2059 1713 : Flag = Chain.getValue(1);
2060 :
2061 1713 : RetOps.push_back(PhysReturnAddrReg);
2062 : }
2063 :
2064 : // Copy the result values into the output registers.
2065 9020 : for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2066 : ++I, ++RealRVLocIdx) {
2067 5570 : CCValAssign &VA = RVLocs[I];
2068 : assert(VA.isRegLoc() && "Can only return in registers!");
2069 : // TODO: Partially return in registers if return values don't fit.
2070 5570 : SDValue Arg = OutVals[RealRVLocIdx];
2071 :
2072 : // Copied from other backends.
2073 5570 : switch (VA.getLocInfo()) {
2074 : case CCValAssign::Full:
2075 : break;
2076 : case CCValAssign::BCvt:
2077 0 : Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2078 0 : break;
2079 : case CCValAssign::SExt:
2080 0 : Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2081 0 : break;
2082 : case CCValAssign::ZExt:
2083 0 : Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2084 0 : break;
2085 : case CCValAssign::AExt:
2086 6 : Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2087 6 : break;
2088 0 : default:
2089 0 : llvm_unreachable("Unknown loc info!");
2090 : }
2091 :
2092 5570 : Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2093 5570 : Flag = Chain.getValue(1);
2094 5570 : RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2095 : }
2096 :
2097 : // FIXME: Does sret work properly?
2098 3450 : if (!Info->isEntryFunction()) {
2099 1713 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2100 : const MCPhysReg *I =
2101 1713 : TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2102 1713 : if (I) {
2103 0 : for (; *I; ++I) {
2104 0 : if (AMDGPU::SReg_64RegClass.contains(*I))
2105 0 : RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2106 0 : else if (AMDGPU::SReg_32RegClass.contains(*I))
2107 0 : RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2108 : else
2109 0 : llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2110 : }
2111 : }
2112 : }
2113 :
2114 : // Update chain and glue.
2115 3450 : RetOps[0] = Chain;
2116 3450 : if (Flag.getNode())
2117 2828 : RetOps.push_back(Flag);
2118 :
2119 : unsigned Opc = AMDGPUISD::ENDPGM;
2120 3450 : if (!IsWaveEnd)
2121 2828 : Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2122 3450 : return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2123 : }
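     : // Terminator selection above: ENDPGM when a shader returns void (the wave
     : // simply ends), RETURN_TO_EPILOG for shaders that produce outputs, and
     : // RET_FLAG for ordinary callable functions returning through the epilogue.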
2124 :
2125 533 : SDValue SITargetLowering::LowerCallResult(
2126 : SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2127 : const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2128 : SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2129 : SDValue ThisVal) const {
2130 533 : CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2131 :
2132 : // Assign locations to each value returned by this call.
2133 : SmallVector<CCValAssign, 16> RVLocs;
2134 : CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2135 533 : *DAG.getContext());
2136 533 : CCInfo.AnalyzeCallResult(Ins, RetCC);
2137 :
2138 : // Copy all of the result registers out of their specified physreg.
2139 872 : for (unsigned i = 0; i != RVLocs.size(); ++i) {
2140 339 : CCValAssign VA = RVLocs[i];
2141 339 : SDValue Val;
2142 :
2143 339 : if (VA.isRegLoc()) {
2144 339 : Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2145 339 : Chain = Val.getValue(1);
2146 339 : InFlag = Val.getValue(2);
2147 : } else if (VA.isMemLoc()) {
2148 0 : report_fatal_error("TODO: return values in memory");
2149 : } else
2150 : llvm_unreachable("unknown argument location type");
2151 :
2152 339 : switch (VA.getLocInfo()) {
2153 : case CCValAssign::Full:
2154 : break;
2155 : case CCValAssign::BCvt:
2156 0 : Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2157 0 : break;
2158 : case CCValAssign::ZExt:
2159 7 : Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2160 7 : DAG.getValueType(VA.getValVT()));
2161 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2162 7 : break;
2163 : case CCValAssign::SExt:
2164 7 : Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2165 7 : DAG.getValueType(VA.getValVT()));
2166 7 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2167 7 : break;
2168 : case CCValAssign::AExt:
2169 3 : Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2170 3 : break;
2171 0 : default:
2172 0 : llvm_unreachable("Unknown loc info!");
2173 : }
2174 :
2175 339 : InVals.push_back(Val);
2176 : }
2177 :
2178 533 : return Chain;
2179 : }
2180 :
2181 : // Add code to pass special inputs required depending on used features separate
2182 : // from the explicit user arguments present in the IR.
2183 575 : void SITargetLowering::passSpecialInputs(
2184 : CallLoweringInfo &CLI,
2185 : CCState &CCInfo,
2186 : const SIMachineFunctionInfo &Info,
2187 : SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2188 : SmallVectorImpl<SDValue> &MemOpChains,
2189 : SDValue Chain) const {
2190 : // If we don't have a call site, this was a call inserted by
2191 : // legalization. These can never use special inputs.
2192 575 : if (!CLI.CS)
2193 0 : return;
2194 :
2195 : const Function *CalleeFunc = CLI.CS.getCalledFunction();
2196 : assert(CalleeFunc);
2197 :
2198 575 : SelectionDAG &DAG = CLI.DAG;
2199 575 : const SDLoc &DL = CLI.DL;
2200 :
2201 575 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2202 :
2203 : auto &ArgUsageInfo =
2204 575 : DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2205 : const AMDGPUFunctionArgInfo &CalleeArgInfo
2206 : = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2207 :
2208 : const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2209 :
2210 : // TODO: Unify with private memory register handling. This is complicated by
2211 : // the fact that at least in kernels, the input argument is not necessarily
2212 : // in the same location as the input.
2213 575 : AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2214 : AMDGPUFunctionArgInfo::DISPATCH_PTR,
2215 : AMDGPUFunctionArgInfo::QUEUE_PTR,
2216 : AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2217 : AMDGPUFunctionArgInfo::DISPATCH_ID,
2218 : AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2219 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2220 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2221 : AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2222 : AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2223 : AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2224 : AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2225 : };
2226 :
2227 6900 : for (auto InputID : InputRegs) {
2228 : const ArgDescriptor *OutgoingArg;
2229 : const TargetRegisterClass *ArgRC;
2230 :
2231 6325 : std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2232 6325 : if (!OutgoingArg)
2233 6204 : continue;
2234 :
2235 : const ArgDescriptor *IncomingArg;
2236 : const TargetRegisterClass *IncomingArgRC;
2237 : std::tie(IncomingArg, IncomingArgRC)
2238 121 : = CallerArgInfo.getPreloadedValue(InputID);
2239 : assert(IncomingArgRC == ArgRC);
2240 :
2241 : // All special arguments are ints for now.
2242 121 : EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2243 121 : SDValue InputReg;
2244 :
2245 121 : if (IncomingArg) {
2246 111 : InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2247 : } else {
2248 : // The implicit arg ptr is special because it doesn't have a corresponding
2249 : // input for kernels, and is computed from the kernarg segment pointer.
2250 : assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2251 10 : InputReg = getImplicitArgPtr(DAG, DL);
2252 : }
2253 :
2254 121 : if (OutgoingArg->isRegister()) {
2255 111 : RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2256 : } else {
2257 10 : unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2258 : SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2259 10 : SpecialArgOffset);
2260 10 : MemOpChains.push_back(ArgStore);
2261 : }
2262 : }
2263 : }
2264 :
2265 : static bool canGuaranteeTCO(CallingConv::ID CC) {
2266 39 : return CC == CallingConv::Fast;
2267 : }
2268 :
2269 : /// Return true if we might ever do TCO for calls with this calling convention.
2270 : static bool mayTailCallThisCC(CallingConv::ID CC) {
2271 51 : switch (CC) {
2272 : case CallingConv::C:
2273 : return true;
2274 : default:
2275 : return canGuaranteeTCO(CC);
2276 : }
2277 : }
2278 :
2279 51 : bool SITargetLowering::isEligibleForTailCallOptimization(
2280 : SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2281 : const SmallVectorImpl<ISD::OutputArg> &Outs,
2282 : const SmallVectorImpl<SDValue> &OutVals,
2283 : const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2284 39 : if (!mayTailCallThisCC(CalleeCC))
2285 : return false;
2286 :
2287 51 : MachineFunction &MF = DAG.getMachineFunction();
2288 51 : const Function &CallerF = MF.getFunction();
2289 : CallingConv::ID CallerCC = CallerF.getCallingConv();
2290 51 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2291 51 : const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2292 :
2293 : // Kernels aren't callable, and don't have a live-in return address, so it
2294 : // doesn't make sense to do a tail call with entry functions.
2295 51 : if (!CallerPreserved)
2296 : return false;
2297 :
2298 : bool CCMatch = CallerCC == CalleeCC;
2299 :
2300 48 : if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2301 0 : if (canGuaranteeTCO(CalleeCC) && CCMatch)
2302 0 : return true;
2303 : return false;
2304 : }
2305 :
2306 : // TODO: Can we handle var args?
2307 48 : if (IsVarArg)
2308 : return false;
2309 :
2310 160 : for (const Argument &Arg : CallerF.args()) {
2311 115 : if (Arg.hasByValAttr())
2312 : return false;
2313 : }
2314 :
2315 45 : LLVMContext &Ctx = *DAG.getContext();
2316 :
2317 : // Check that the call results are passed in the same way.
2318 45 : if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2319 : CCAssignFnForCall(CalleeCC, IsVarArg),
2320 : CCAssignFnForCall(CallerCC, IsVarArg)))
2321 : return false;
2322 :
2323 : // The callee has to preserve all registers the caller needs to preserve.
2324 45 : if (!CCMatch) {
2325 0 : const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2326 0 : if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2327 : return false;
2328 : }
2329 :
2330 : // Nothing more to check if the callee is taking no arguments.
2331 45 : if (Outs.empty())
2332 : return true;
2333 :
2334 : SmallVector<CCValAssign, 16> ArgLocs;
2335 82 : CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2336 :
2337 41 : CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2338 :
2339 41 : const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2340 : // If the stack arguments for this call do not fit into our own save area then
2341 : // the call cannot be made a tail call.
2342 : // TODO: Is this really necessary?
2343 41 : if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2344 : return false;
2345 :
2346 38 : const MachineRegisterInfo &MRI = MF.getRegInfo();
2347 38 : return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2348 : }
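     : // Eligibility in short: a compatible calling convention, no varargs, no byval
     : // arguments in the caller, callee-preserved registers that cover everything
     : // the caller needs preserved, and outgoing stack arguments that fit inside the
     : // caller's own incoming argument area.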
2349 :
2350 28 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2351 28 : if (!CI->isTailCall())
2352 : return false;
2353 :
2354 4 : const Function *ParentFn = CI->getParent()->getParent();
2355 4 : if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2356 : return false;
2357 :
2358 1 : auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2359 1 : return (Attr.getValueAsString() != "true");
2360 : }
2361 :
2362 : // The wave scratch offset register is used as the global base pointer.
2363 583 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2364 : SmallVectorImpl<SDValue> &InVals) const {
2365 583 : SelectionDAG &DAG = CLI.DAG;
2366 583 : const SDLoc &DL = CLI.DL;
2367 : SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2368 : SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2369 : SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2370 583 : SDValue Chain = CLI.Chain;
2371 583 : SDValue Callee = CLI.Callee;
2372 : bool &IsTailCall = CLI.IsTailCall;
2373 583 : CallingConv::ID CallConv = CLI.CallConv;
2374 583 : bool IsVarArg = CLI.IsVarArg;
2375 : bool IsSibCall = false;
2376 : bool IsThisReturn = false;
2377 583 : MachineFunction &MF = DAG.getMachineFunction();
2378 :
2379 583 : if (IsVarArg) {
2380 : return lowerUnhandledCall(CLI, InVals,
2381 2 : "unsupported call to variadic function ");
2382 : }
2383 :
2384 582 : if (!CLI.CS.getInstruction())
2385 1 : report_fatal_error("unsupported libcall legalization");
2386 :
2387 : if (!CLI.CS.getCalledFunction()) {
2388 : return lowerUnhandledCall(CLI, InVals,
2389 8 : "unsupported indirect call to function ");
2390 : }
2391 :
2392 577 : if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2393 : return lowerUnhandledCall(CLI, InVals,
2394 2 : "unsupported required tail call to function ");
2395 : }
2396 :
2397 1152 : if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2398 : // Note the issue is with the CC of the calling function, not of the call
2399 : // itself.
2400 : return lowerUnhandledCall(CLI, InVals,
2401 2 : "unsupported call from graphics shader of function ");
2402 : }
2403 :
2404 : // The first 4 bytes are reserved for the callee's emergency stack slot.
2405 575 : if (IsTailCall) {
2406 51 : IsTailCall = isEligibleForTailCallOptimization(
2407 : Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2408 51 : if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2409 0 : report_fatal_error("failed to perform tail call elimination on a call "
2410 : "site marked musttail");
2411 : }
2412 :
2413 51 : bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2414 :
2415 : // A sibling call is one where we're under the usual C ABI and not planning
2416 : // to change that but can still do a tail call:
2417 51 : if (!TailCallOpt && IsTailCall)
2418 : IsSibCall = true;
2419 :
2420 : if (IsTailCall)
2421 : ++NumTailCalls;
2422 : }
2423 :
2424 575 : const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2425 :
2426 : // Analyze operands of the call, assigning locations to each operand.
2427 : SmallVector<CCValAssign, 16> ArgLocs;
2428 1150 : CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2429 575 : CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2430 :
2431 : // The first 4 bytes are reserved for the callee's emergency stack slot.
2432 575 : CCInfo.AllocateStack(4, 4);
2433 :
2434 575 : CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2435 :
2436 : // Get a count of how many bytes are to be pushed on the stack.
2437 575 : unsigned NumBytes = CCInfo.getNextStackOffset();
2438 :
2439 575 : if (IsSibCall) {
2440 : // Since we're not changing the ABI to make this a tail call, the memory
2441 : // operands are already available in the caller's incoming argument space.
2442 : NumBytes = 0;
2443 : }
2444 :
2445 : // FPDiff is the byte offset of the call's argument area from the callee's.
2446 : // Stores to callee stack arguments will be placed in FixedStackSlots offset
2447 : // by this amount for a tail call. In a sibling call it must be 0 because the
2448 : // caller will deallocate the entire stack and the callee still expects its
2449 : // arguments to begin at SP+0. Completely unused for non-tail calls.
2450 : int32_t FPDiff = 0;
2451 575 : MachineFrameInfo &MFI = MF.getFrameInfo();
2452 : SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2453 :
2454 575 : SDValue CallerSavedFP;
2455 :
2456 : // Adjust the stack pointer for the new arguments...
2457 : // These operations are automatically eliminated by the prolog/epilog pass
2458 575 : if (!IsSibCall) {
2459 533 : Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2460 :
2461 533 : unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2462 :
2463 : // In the HSA case, this should be an identity copy.
2464 : SDValue ScratchRSrcReg
2465 533 : = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2466 533 : RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2467 :
2468 : // TODO: Don't hardcode these registers; get them from the callee function.
2469 : SDValue ScratchWaveOffsetReg
2470 533 : = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2471 533 : RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2472 :
2473 533 : if (!Info->isEntryFunction()) {
2474 : // Avoid clobbering this function's FP value. In the current convention
2475 : // the callee will overwrite it, so do a save/restore around the call site.
2476 104 : CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2477 104 : Info->getFrameOffsetReg(), MVT::i32);
2478 : }
2479 : }
2480 :
2481 : SmallVector<SDValue, 8> MemOpChains;
2482 : MVT PtrVT = MVT::i32;
2483 :
2484 : // Walk the register/memloc assignments, inserting copies/loads.
2485 2648 : for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2486 : ++i, ++realArgIdx) {
2487 2073 : CCValAssign &VA = ArgLocs[i];
2488 2073 : SDValue Arg = OutVals[realArgIdx];
2489 :
2490 : // Promote the value if needed.
2491 2073 : switch (VA.getLocInfo()) {
2492 : case CCValAssign::Full:
2493 : break;
2494 : case CCValAssign::BCvt:
2495 0 : Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2496 0 : break;
2497 : case CCValAssign::ZExt:
2498 10 : Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2499 10 : break;
2500 : case CCValAssign::SExt:
2501 10 : Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2502 10 : break;
2503 : case CCValAssign::AExt:
2504 4 : Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2505 4 : break;
2506 : case CCValAssign::FPExt:
2507 0 : Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2508 0 : break;
2509 0 : default:
2510 0 : llvm_unreachable("Unknown loc info!");
2511 : }
2512 :
2513 2073 : if (VA.isRegLoc()) {
2514 3982 : RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2515 : } else {
2516 : assert(VA.isMemLoc());
2517 :
2518 82 : SDValue DstAddr;
2519 : MachinePointerInfo DstInfo;
2520 :
2521 82 : unsigned LocMemOffset = VA.getLocMemOffset();
2522 82 : int32_t Offset = LocMemOffset;
2523 :
2524 82 : SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2525 : unsigned Align = 0;
2526 :
2527 82 : if (IsTailCall) {
2528 35 : ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2529 35 : unsigned OpSize = Flags.isByVal() ?
2530 35 : Flags.getByValSize() : VA.getValVT().getStoreSize();
2531 :
2532 : // FIXME: We can have better than the minimum byval required alignment.
2533 35 : Align = Flags.isByVal() ? Flags.getByValAlign() :
2534 : MinAlign(Subtarget->getStackAlignment(), Offset);
2535 :
2536 : Offset = Offset + FPDiff;
2537 35 : int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2538 :
2539 35 : DstAddr = DAG.getFrameIndex(FI, PtrVT);
2540 35 : DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2541 :
2542 : // Make sure any stack arguments overlapping with where we're storing
2543 : // are loaded before this eventual operation. Otherwise they'll be
2544 : // clobbered.
2545 :
2546 : // FIXME: Why is this really necessary? This seems to just result in a
2547 : // lot of code to copy the stack and write them back to the same
2548 : // locations, which are supposed to be immutable?
2549 35 : Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2550 : } else {
2551 47 : DstAddr = PtrOff;
2552 47 : DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2553 47 : Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2554 : }
2555 :
2556 82 : if (Outs[i].Flags.isByVal()) {
2557 : SDValue SizeNode =
2558 40 : DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2559 : SDValue Cpy = DAG.getMemcpy(
2560 : Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2561 : /*isVol = */ false, /*AlwaysInline = */ true,
2562 : /*isTailCall = */ false, DstInfo,
2563 40 : MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2564 80 : *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2565 :
2566 40 : MemOpChains.push_back(Cpy);
2567 : } else {
2568 42 : SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2569 42 : MemOpChains.push_back(Store);
2570 : }
2571 : }
2572 : }
2573 :
2574 : // Copy special input registers after user input arguments.
2575 575 : passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2576 :
2577 575 : if (!MemOpChains.empty())
2578 62 : Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2579 :
2580 : // Build a sequence of copy-to-reg nodes chained together with token chain
2581 : // and flag operands which copy the outgoing args into the appropriate regs.
2582 575 : SDValue InFlag;
2583 3743 : for (auto &RegToPass : RegsToPass) {
2584 3168 : Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2585 3168 : RegToPass.second, InFlag);
2586 3168 : InFlag = Chain.getValue(1);
2587 : }
2588 :
2589 :
2590 575 : SDValue PhysReturnAddrReg;
2591 575 : if (IsTailCall) {
2592 : // Since the return is being combined with the call, we need to pass on the
2593 : // return address.
2594 :
2595 42 : const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2596 : SDValue ReturnAddrReg = CreateLiveInRegister(
2597 84 : DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2598 :
2599 42 : PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2600 42 : MVT::i64);
2601 42 : Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2602 42 : InFlag = Chain.getValue(1);
2603 : }
2604 :
2605 : // We don't usually want to end the call-sequence here because we would tidy
2606 : // the frame up *after* the call; however, in the ABI-changing tail-call case
2607 : // we've carefully laid out the parameters so that when sp is reset they'll be
2608 : // in the correct location.
2609 575 : if (IsTailCall && !IsSibCall) {
2610 0 : Chain = DAG.getCALLSEQ_END(Chain,
2611 : DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2612 : DAG.getTargetConstant(0, DL, MVT::i32),
2613 0 : InFlag, DL);
2614 0 : InFlag = Chain.getValue(1);
2615 : }
2616 :
2617 : std::vector<SDValue> Ops;
2618 575 : Ops.push_back(Chain);
2619 575 : Ops.push_back(Callee);
2620 :
2621 575 : if (IsTailCall) {
2622 : // Each tail call may have to adjust the stack by a different amount, so
2623 : // this information must travel along with the operation for eventual
2624 : // consumption by emitEpilogue.
2625 42 : Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2626 :
2627 42 : Ops.push_back(PhysReturnAddrReg);
2628 : }
2629 :
2630 : // Add argument registers to the end of the list so that they are known live
2631 : // into the call.
2632 3743 : for (auto &RegToPass : RegsToPass) {
2633 3168 : Ops.push_back(DAG.getRegister(RegToPass.first,
2634 6336 : RegToPass.second.getValueType()));
2635 : }
2636 :
2637 : // Add a register mask operand representing the call-preserved registers.
2638 :
2639 575 : auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2640 575 : const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2641 : assert(Mask && "Missing call preserved mask for calling convention");
2642 575 : Ops.push_back(DAG.getRegisterMask(Mask));
2643 :
2644 575 : if (InFlag.getNode())
2645 575 : Ops.push_back(InFlag);
2646 :
2647 575 : SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2648 :
2649 : // If we're doing a tall call, use a TC_RETURN here rather than an
2650 : // If we're doing a tail call, use a TC_RETURN here rather than an
2651 575 : if (IsTailCall) {
2652 : MFI.setHasTailCall();
2653 42 : return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2654 : }
2655 :
2656 : // Returns a chain and a flag for retval copy to use.
2657 533 : SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2658 533 : Chain = Call.getValue(0);
2659 533 : InFlag = Call.getValue(1);
2660 :
2661 533 : if (CallerSavedFP) {
2662 104 : SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2663 104 : Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2664 104 : InFlag = Chain.getValue(1);
2665 : }
2666 :
2667 533 : uint64_t CalleePopBytes = NumBytes;
2668 533 : Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2669 : DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2670 533 : InFlag, DL);
2671 533 : if (!Ins.empty())
2672 117 : InFlag = Chain.getValue(1);
2673 :
2674 : // Handle result values, copying them out of physregs into vregs that we
2675 : // return.
2676 : return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2677 : InVals, IsThisReturn,
2678 533 : IsThisReturn ? OutVals[0] : SDValue());
2679 : }
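     : // Notes on the sequence above: the scratch resource descriptor and wave offset
     : // travel in fixed SGPRs (SGPR0-SGPR3 and SGPR4 here), the caller's frame offset
     : // register is saved and restored around non-tail calls, and tail calls are
     : // emitted as TC_RETURN so the epilogue performs the actual jump.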
2680 :
2681 131 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2682 : SelectionDAG &DAG) const {
2683 0 : unsigned Reg = StringSwitch<unsigned>(RegName)
2684 131 : .Case("m0", AMDGPU::M0)
2685 131 : .Case("exec", AMDGPU::EXEC)
2686 131 : .Case("exec_lo", AMDGPU::EXEC_LO)
2687 131 : .Case("exec_hi", AMDGPU::EXEC_HI)
2688 131 : .Case("flat_scratch", AMDGPU::FLAT_SCR)
2689 131 : .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2690 131 : .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2691 : .Default(AMDGPU::NoRegister);
2692 :
2693 131 : if (Reg == AMDGPU::NoRegister) {
2694 0 : report_fatal_error(Twine("invalid register name \""
2695 : + StringRef(RegName) + "\"."));
2696 :
2697 : }
2698 :
2699 162 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2700 31 : Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2701 1 : report_fatal_error(Twine("invalid register \""
2702 : + StringRef(RegName) + "\" for subtarget."));
2703 : }
2704 :
2705 : switch (Reg) {
2706 17 : case AMDGPU::M0:
2707 : case AMDGPU::EXEC_LO:
2708 : case AMDGPU::EXEC_HI:
2709 : case AMDGPU::FLAT_SCR_LO:
2710 : case AMDGPU::FLAT_SCR_HI:
2711 17 : if (VT.getSizeInBits() == 32)
2712 : return Reg;
2713 : break;
2714 113 : case AMDGPU::EXEC:
2715 : case AMDGPU::FLAT_SCR:
2716 113 : if (VT.getSizeInBits() == 64)
2717 : return Reg;
2718 : break;
2719 0 : default:
2720 0 : llvm_unreachable("missing register type checking");
2721 : }
2722 :
2723 2 : report_fatal_error(Twine("invalid type for register \""
2724 : + StringRef(RegName) + "\"."));
2725 : }
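     : // Illustrative (not taken from this file): IR along the lines of
     : //   %v = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"exec"}
     : // reaches this hook with RegName == "exec" and a 64-bit VT and yields
     : // AMDGPU::EXEC, while a 32-bit request for "exec" is rejected by the size
     : // check above.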
2726 :
2727 : // If kill is not the last instruction, split the block so kill is always a
2728 : // proper terminator.
2729 84 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2730 : MachineBasicBlock *BB) const {
2731 84 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2732 :
2733 : MachineBasicBlock::iterator SplitPoint(&MI);
2734 : ++SplitPoint;
2735 :
2736 84 : if (SplitPoint == BB->end()) {
2737 : // Don't bother with a new block.
2738 8 : MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2739 4 : return BB;
2740 : }
2741 :
2742 80 : MachineFunction *MF = BB->getParent();
2743 : MachineBasicBlock *SplitBB
2744 80 : = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2745 :
2746 : MF->insert(++MachineFunction::iterator(BB), SplitBB);
2747 : SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2748 :
2749 80 : SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2750 80 : BB->addSuccessor(SplitBB);
2751 :
2752 160 : MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2753 80 : return SplitBB;
2754 : }
2755 :
2756 : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2757 : // wavefront. If the value is uniform and just happens to be in a VGPR, this
2758 : // will only do one iteration. In the worst case, this will loop 64 times.
2759 : //
2760 : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2761 32 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2762 : const SIInstrInfo *TII,
2763 : MachineRegisterInfo &MRI,
2764 : MachineBasicBlock &OrigBB,
2765 : MachineBasicBlock &LoopBB,
2766 : const DebugLoc &DL,
2767 : const MachineOperand &IdxReg,
2768 : unsigned InitReg,
2769 : unsigned ResultReg,
2770 : unsigned PhiReg,
2771 : unsigned InitSaveExecReg,
2772 : int Offset,
2773 : bool UseGPRIdxMode,
2774 : bool IsIndirectSrc) {
2775 32 : MachineBasicBlock::iterator I = LoopBB.begin();
2776 :
2777 32 : unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2778 32 : unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2779 32 : unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2780 32 : unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2781 :
2782 32 : BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2783 32 : .addReg(InitReg)
2784 : .addMBB(&OrigBB)
2785 32 : .addReg(ResultReg)
2786 : .addMBB(&LoopBB);
2787 :
2788 32 : BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2789 32 : .addReg(InitSaveExecReg)
2790 : .addMBB(&OrigBB)
2791 32 : .addReg(NewExec)
2792 : .addMBB(&LoopBB);
2793 :
2794 : // Read the next variant <- also loop target.
2795 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2796 32 : .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2797 :
2798 : // Compare the just read M0 value to all possible Idx values.
2799 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2800 32 : .addReg(CurrentIdxReg)
2801 32 : .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2802 :
2803 : // Update EXEC, save the original EXEC value to VCC.
2804 96 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2805 32 : .addReg(CondReg, RegState::Kill);
2806 :
2807 : MRI.setSimpleHint(NewExec, CondReg);
2808 :
2809 32 : if (UseGPRIdxMode) {
2810 : unsigned IdxReg;
2811 16 : if (Offset == 0) {
2812 : IdxReg = CurrentIdxReg;
2813 : } else {
2814 6 : IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2815 18 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2816 6 : .addReg(CurrentIdxReg, RegState::Kill)
2817 6 : .addImm(Offset);
2818 : }
2819 16 : unsigned IdxMode = IsIndirectSrc ?
2820 : VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2821 : MachineInstr *SetOn =
2822 32 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2823 16 : .addReg(IdxReg, RegState::Kill)
2824 16 : .addImm(IdxMode);
2825 16 : SetOn->getOperand(3).setIsUndef();
2826 : } else {
2827 : // Move index from VCC into M0
2828 16 : if (Offset == 0) {
2829 30 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2830 10 : .addReg(CurrentIdxReg, RegState::Kill);
2831 : } else {
2832 18 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2833 6 : .addReg(CurrentIdxReg, RegState::Kill)
2834 6 : .addImm(Offset);
2835 : }
2836 : }
2837 :
2838 : // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2839 : MachineInstr *InsertPt =
2840 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2841 32 : .addReg(AMDGPU::EXEC)
2842 32 : .addReg(NewExec);
2843 :
2844 : // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2845 : // s_cbranch_scc0?
2846 :
2847 : // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2848 64 : BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2849 : .addMBB(&LoopBB);
2850 :
2851 32 : return InsertPt->getIterator();
2852 : }
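
A rough scalar model of the waterfall loop above may help; it is illustrative only and not part of this file, and it models a 64-lane wavefront with the EXEC mask as a plain 64-bit integer.

  #include <array>
  #include <cstdint>

  // Each iteration retires every lane that shares one index value, so the loop
  // runs once per unique index: one pass for a uniform index, 64 in the worst case.
  static void waterfallModel(const std::array<uint32_t, 64> &IdxPerLane,
                             uint64_t Exec) {
    while (Exec != 0) {
      // v_readfirstlane_b32: take the index from the first still-active lane.
      uint32_t Idx = IdxPerLane[__builtin_ctzll(Exec)];

      // v_cmp_eq_u32 + s_and_saveexec_b64: collect the active lanes with that index.
      uint64_t Same = 0;
      for (unsigned L = 0; L != 64; ++L)
        if (((Exec >> L) & 1) && IdxPerLane[L] == Idx)
          Same |= 1ull << L;

      // ... the indexed access runs here with M0 (or the GPR index mode) set to
      // Idx, active only for the lanes in Same ...

      // s_xor_b64 exec, exec, Same: retire those lanes; s_cbranch_execnz loops back.
      Exec &= ~Same;
    }
  }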
2853 :
2854 : // This has slightly sub-optimal regalloc when the source vector is killed by
2855 : // the read. The register allocator does not understand that the kill is
2856 : // per-workitem, so the source is kept alive for the whole loop and we end up
2857 : // not reusing a subregister from it, using one more VGPR than necessary. That
2858 : // VGPR was saved back when this was expanded after register allocation.
2859 32 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2860 : MachineBasicBlock &MBB,
2861 : MachineInstr &MI,
2862 : unsigned InitResultReg,
2863 : unsigned PhiReg,
2864 : int Offset,
2865 : bool UseGPRIdxMode,
2866 : bool IsIndirectSrc) {
2867 32 : MachineFunction *MF = MBB.getParent();
2868 32 : MachineRegisterInfo &MRI = MF->getRegInfo();
2869 : const DebugLoc &DL = MI.getDebugLoc();
2870 : MachineBasicBlock::iterator I(&MI);
2871 :
2872 32 : unsigned DstReg = MI.getOperand(0).getReg();
2873 32 : unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2874 32 : unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2875 :
2876 64 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2877 :
2878 : // Save the EXEC mask
2879 64 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2880 32 : .addReg(AMDGPU::EXEC);
2881 :
2882 : // To insert the loop we need to split the block. Move everything after this
2883 : // point to a new block, and insert a new empty block between the two.
2884 32 : MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2885 32 : MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2886 : MachineFunction::iterator MBBI(MBB);
2887 : ++MBBI;
2888 :
2889 : MF->insert(MBBI, LoopBB);
2890 : MF->insert(MBBI, RemainderBB);
2891 :
2892 32 : LoopBB->addSuccessor(LoopBB);
2893 32 : LoopBB->addSuccessor(RemainderBB);
2894 :
2895 : // Move the rest of the block into a new block.
2896 32 : RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2897 : RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2898 :
2899 32 : MBB.addSuccessor(LoopBB);
2900 :
2901 32 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2902 :
2903 : auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2904 : InitResultReg, DstReg, PhiReg, TmpExec,
2905 32 : Offset, UseGPRIdxMode, IsIndirectSrc);
2906 :
2907 32 : MachineBasicBlock::iterator First = RemainderBB->begin();
2908 64 : BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2909 32 : .addReg(SaveExec);
2910 :
2911 32 : return InsPt;
2912 : }
2913 :
2914 : // Returns subreg index, offset
2915 : static std::pair<unsigned, int>
2916 0 : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2917 : const TargetRegisterClass *SuperRC,
2918 : unsigned VecReg,
2919 : int Offset) {
2920 0 : int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2921 :
2922 : // Skip out of bounds offsets, or else we would end up using an undefined
2923 : // register.
2924 0 : if (Offset >= NumElts || Offset < 0)
2925 0 : return std::make_pair(AMDGPU::sub0, Offset);
2926 :
2927 0 : return std::make_pair(AMDGPU::sub0 + Offset, 0);
2928 : }
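
As a quick worked example of the contract above (illustrative only, using plain ints in place of SIRegisterInfo and the subregister enums): an in-range constant offset is folded into the subregister index, while an out-of-range offset falls back to sub0 plus a runtime offset.

  #include <utility>
  static std::pair<int, int> sketchRegAndOffset(int NumElts, int Offset) {
    if (Offset >= NumElts || Offset < 0)
      return {0, Offset};   // sub0 plus a runtime offset (out of bounds)
    return {Offset, 0};     // sub0 + Offset becomes the subregister, offset folded away
  }
  // e.g. NumElts = 4: Offset 2 -> {2, 0}; Offset 7 -> {0, 7}.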
2929 :
2930 : // Return true if the index is an SGPR and was set.
2931 161 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2932 : MachineRegisterInfo &MRI,
2933 : MachineInstr &MI,
2934 : int Offset,
2935 : bool UseGPRIdxMode,
2936 : bool IsIndirectSrc) {
2937 161 : MachineBasicBlock *MBB = MI.getParent();
2938 : const DebugLoc &DL = MI.getDebugLoc();
2939 : MachineBasicBlock::iterator I(&MI);
2940 :
2941 161 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2942 161 : const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2943 :
2944 : assert(Idx->getReg() != AMDGPU::NoRegister);
2945 :
2946 161 : if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2947 : return false;
2948 :
2949 129 : if (UseGPRIdxMode) {
2950 31 : unsigned IdxMode = IsIndirectSrc ?
2951 : VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2952 31 : if (Offset == 0) {
2953 : MachineInstr *SetOn =
2954 34 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2955 : .add(*Idx)
2956 17 : .addImm(IdxMode);
2957 :
2958 17 : SetOn->getOperand(3).setIsUndef();
2959 : } else {
2960 14 : unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2961 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2962 : .add(*Idx)
2963 14 : .addImm(Offset);
2964 : MachineInstr *SetOn =
2965 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2966 14 : .addReg(Tmp, RegState::Kill)
2967 14 : .addImm(IdxMode);
2968 :
2969 14 : SetOn->getOperand(3).setIsUndef();
2970 : }
2971 :
2972 31 : return true;
2973 : }
2974 :
2975 98 : if (Offset == 0) {
2976 252 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2977 : .add(*Idx);
2978 : } else {
2979 28 : BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2980 : .add(*Idx)
2981 14 : .addImm(Offset);
2982 : }
2983 :
2984 : return true;
2985 : }
2986 :
2987 : // Control flow needs to be inserted if indexing with a VGPR.
2988 71 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
2989 : MachineBasicBlock &MBB,
2990 : const GCNSubtarget &ST) {
2991 71 : const SIInstrInfo *TII = ST.getInstrInfo();
2992 : const SIRegisterInfo &TRI = TII->getRegisterInfo();
2993 71 : MachineFunction *MF = MBB.getParent();
2994 71 : MachineRegisterInfo &MRI = MF->getRegInfo();
2995 :
2996 71 : unsigned Dst = MI.getOperand(0).getReg();
2997 71 : unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
2998 71 : int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
2999 :
3000 : const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3001 :
3002 : unsigned SubReg;
3003 : std::tie(SubReg, Offset)
3004 71 : = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3005 :
3006 : bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3007 :
3008 71 : if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3009 : MachineBasicBlock::iterator I(&MI);
3010 : const DebugLoc &DL = MI.getDebugLoc();
3011 :
3012 59 : if (UseGPRIdxMode) {
3013 : // TODO: Look at the uses to avoid the copy. This may require rescheduling
3014 : // to avoid interfering with other uses, so probably requires a new
3015 : // optimization pass.
3016 34 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3017 17 : .addReg(SrcReg, RegState::Undef, SubReg)
3018 17 : .addReg(SrcReg, RegState::Implicit)
3019 17 : .addReg(AMDGPU::M0, RegState::Implicit);
3020 34 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3021 : } else {
3022 126 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3023 42 : .addReg(SrcReg, RegState::Undef, SubReg)
3024 42 : .addReg(SrcReg, RegState::Implicit);
3025 : }
3026 :
3027 59 : MI.eraseFromParent();
3028 :
3029 : return &MBB;
3030 : }
3031 :
3032 : const DebugLoc &DL = MI.getDebugLoc();
3033 : MachineBasicBlock::iterator I(&MI);
3034 :
3035 12 : unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3036 12 : unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3037 :
3038 24 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3039 :
3040 : auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3041 12 : Offset, UseGPRIdxMode, true);
3042 12 : MachineBasicBlock *LoopBB = InsPt->getParent();
3043 :
3044 12 : if (UseGPRIdxMode) {
3045 12 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3046 6 : .addReg(SrcReg, RegState::Undef, SubReg)
3047 6 : .addReg(SrcReg, RegState::Implicit)
3048 6 : .addReg(AMDGPU::M0, RegState::Implicit);
3049 12 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3050 : } else {
3051 18 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3052 6 : .addReg(SrcReg, RegState::Undef, SubReg)
3053 6 : .addReg(SrcReg, RegState::Implicit);
3054 : }
3055 :
3056 12 : MI.eraseFromParent();
3057 :
3058 12 : return LoopBB;
3059 : }
3060 :
3061 66 : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3062 : const TargetRegisterClass *VecRC) {
3063 66 : switch (TRI.getRegSizeInBits(*VecRC)) {
3064 : case 32: // 4 bytes
3065 : return AMDGPU::V_MOVRELD_B32_V1;
3066 6 : case 64: // 8 bytes
3067 6 : return AMDGPU::V_MOVRELD_B32_V2;
3068 38 : case 128: // 16 bytes
3069 38 : return AMDGPU::V_MOVRELD_B32_V4;
3070 16 : case 256: // 32 bytes
3071 16 : return AMDGPU::V_MOVRELD_B32_V8;
3072 6 : case 512: // 64 bytes
3073 6 : return AMDGPU::V_MOVRELD_B32_V16;
3074 0 : default:
3075 0 : llvm_unreachable("unsupported size for MOVRELD pseudos");
3076 : }
3077 : }
3078 :
3079 90 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3080 : MachineBasicBlock &MBB,
3081 : const GCNSubtarget &ST) {
3082 90 : const SIInstrInfo *TII = ST.getInstrInfo();
3083 : const SIRegisterInfo &TRI = TII->getRegisterInfo();
3084 90 : MachineFunction *MF = MBB.getParent();
3085 90 : MachineRegisterInfo &MRI = MF->getRegInfo();
3086 :
3087 90 : unsigned Dst = MI.getOperand(0).getReg();
3088 90 : const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3089 90 : const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3090 90 : const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3091 90 : int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3092 90 : const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3093 :
3094 : // This can be an immediate, but will be folded later.
3095 : assert(Val->getReg());
3096 :
3097 : unsigned SubReg;
3098 90 : std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3099 : SrcVec->getReg(),
3100 90 : Offset);
3101 : bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3102 :
3103 90 : if (Idx->getReg() == AMDGPU::NoRegister) {
3104 : MachineBasicBlock::iterator I(&MI);
3105 : const DebugLoc &DL = MI.getDebugLoc();
3106 :
3107 : assert(Offset == 0);
3108 :
3109 0 : BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3110 : .add(*SrcVec)
3111 : .add(*Val)
3112 0 : .addImm(SubReg);
3113 :
3114 0 : MI.eraseFromParent();
3115 : return &MBB;
3116 : }
3117 :
3118 90 : if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3119 : MachineBasicBlock::iterator I(&MI);
3120 : const DebugLoc &DL = MI.getDebugLoc();
3121 :
3122 70 : if (UseGPRIdxMode) {
3123 28 : BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3124 14 : .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3125 : .add(*Val)
3126 14 : .addReg(Dst, RegState::ImplicitDefine)
3127 14 : .addReg(SrcVec->getReg(), RegState::Implicit)
3128 14 : .addReg(AMDGPU::M0, RegState::Implicit);
3129 :
3130 28 : BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3131 : } else {
3132 56 : const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3133 :
3134 112 : BuildMI(MBB, I, DL, MovRelDesc)
3135 56 : .addReg(Dst, RegState::Define)
3136 56 : .addReg(SrcVec->getReg())
3137 : .add(*Val)
3138 56 : .addImm(SubReg - AMDGPU::sub0);
3139 : }
3140 :
3141 70 : MI.eraseFromParent();
3142 : return &MBB;
3143 : }
3144 :
3145 20 : if (Val->isReg())
3146 20 : MRI.clearKillFlags(Val->getReg());
3147 :
3148 : const DebugLoc &DL = MI.getDebugLoc();
3149 :
3150 20 : unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3151 :
3152 : auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3153 20 : Offset, UseGPRIdxMode, false);
3154 20 : MachineBasicBlock *LoopBB = InsPt->getParent();
3155 :
3156 20 : if (UseGPRIdxMode) {
3157 20 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3158 10 : .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3159 : .add(*Val) // src0
3160 10 : .addReg(Dst, RegState::ImplicitDefine)
3161 10 : .addReg(PhiReg, RegState::Implicit)
3162 10 : .addReg(AMDGPU::M0, RegState::Implicit);
3163 20 : BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3164 : } else {
3165 10 : const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3166 :
3167 20 : BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3168 10 : .addReg(Dst, RegState::Define)
3169 10 : .addReg(PhiReg)
3170 : .add(*Val)
3171 10 : .addImm(SubReg - AMDGPU::sub0);
3172 : }
3173 :
3174 20 : MI.eraseFromParent();
3175 :
3176 20 : return LoopBB;
3177 : }
3178 :
3179 14237 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3180 : MachineInstr &MI, MachineBasicBlock *BB) const {
3181 :
3182 14237 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3183 14237 : MachineFunction *MF = BB->getParent();
3184 14237 : SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3185 :
3186 14237 : if (TII->isMIMG(MI)) {
3187 737 : if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3188 0 : report_fatal_error("missing mem operand from MIMG instruction");
3189 : }
3190 : // Add a memoperand for mimg instructions so that they aren't assumed to
3191 : // be ordered memory instuctions.
3192 :     // be ordered memory instructions.
3193 737 : return BB;
3194 : }
3195 :
3196 13500 : switch (MI.getOpcode()) {
3197 2285 : case AMDGPU::S_ADD_U64_PSEUDO:
3198 : case AMDGPU::S_SUB_U64_PSEUDO: {
3199 2285 : MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3200 : const DebugLoc &DL = MI.getDebugLoc();
3201 :
3202 2285 : MachineOperand &Dest = MI.getOperand(0);
3203 : MachineOperand &Src0 = MI.getOperand(1);
3204 : MachineOperand &Src1 = MI.getOperand(2);
3205 :
3206 2285 : unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3207 2285 : unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3208 :
3209 : MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3210 : Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3211 2285 : &AMDGPU::SReg_32_XM0RegClass);
3212 : MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3213 : Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3214 2285 : &AMDGPU::SReg_32_XM0RegClass);
3215 :
3216 : MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3217 : Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3218 2285 : &AMDGPU::SReg_32_XM0RegClass);
3219 : MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3220 : Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3221 2285 : &AMDGPU::SReg_32_XM0RegClass);
3222 :
3223 2285 : bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3224 :
3225 2285 : unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3226 2285 : unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3227 4570 : BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3228 : .add(Src0Sub0)
3229 : .add(Src1Sub0);
3230 4570 : BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3231 : .add(Src0Sub1)
3232 : .add(Src1Sub1);
3233 4570 : BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3234 2285 : .addReg(DestSub0)
3235 : .addImm(AMDGPU::sub0)
3236 2285 : .addReg(DestSub1)
3237 : .addImm(AMDGPU::sub1);
3238 2285 : MI.eraseFromParent();
3239 : return BB;
3240 : }
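
A scalar sketch of what this expansion computes (illustrative only, not part of this file): the 64-bit scalar add or subtract is split into s_add_u32/s_sub_u32 on the sub0 halves, which produce the carry or borrow in SCC, and s_addc_u32/s_subb_u32 on the sub1 halves, which consume it.

  #include <cstdint>
  static uint64_t addU64ViaHalves(uint64_t A, uint64_t B) {
    uint32_t Lo = uint32_t(A) + uint32_t(B);                       // s_add_u32, carry-out in SCC
    uint32_t Carry = Lo < uint32_t(A);                             // SCC
    uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry;   // s_addc_u32
    return (uint64_t(Hi) << 32) | Lo;                              // REG_SEQUENCE of sub0/sub1
  }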
3241 9134 : case AMDGPU::SI_INIT_M0: {
3242 9134 : BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3243 18268 : TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3244 9134 : .add(MI.getOperand(0));
3245 9134 : MI.eraseFromParent();
3246 9134 : return BB;
3247 : }
3248 3 : case AMDGPU::SI_INIT_EXEC:
3249 : // This should be before all vector instructions.
3250 : BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3251 3 : AMDGPU::EXEC)
3252 3 : .addImm(MI.getOperand(0).getImm());
3253 3 : MI.eraseFromParent();
3254 3 : return BB;
3255 :
3256 : case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3257 : // Extract the thread count from an SGPR input and set EXEC accordingly.
3258 : // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3259 : //
3260 : // S_BFE_U32 count, input, {shift, 7}
3261 : // S_BFM_B64 exec, count, 0
3262 : // S_CMP_EQ_U32 count, 64
3263 : // S_CMOV_B64 exec, -1
3264 : MachineInstr *FirstMI = &*BB->begin();
3265 4 : MachineRegisterInfo &MRI = MF->getRegInfo();
3266 4 : unsigned InputReg = MI.getOperand(0).getReg();
3267 4 : unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3268 : bool Found = false;
3269 :
3270 : // Move the COPY of the input reg to the beginning, so that we can use it.
3271 10 : for (auto I = BB->begin(); I != &MI; I++) {
3272 20 : if (I->getOpcode() != TargetOpcode::COPY ||
3273 10 : I->getOperand(0).getReg() != InputReg)
3274 : continue;
3275 :
3276 4 : if (I == FirstMI) {
3277 : FirstMI = &*++BB->begin();
3278 : } else {
3279 4 : I->removeFromParent();
3280 : BB->insert(FirstMI, &*I);
3281 : }
3282 : Found = true;
3283 : break;
3284 : }
3285 : assert(Found);
3286 : (void)Found;
3287 :
3288 : // This should be before all vector instructions.
3289 12 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3290 4 : .addReg(InputReg)
3291 4 : .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3292 8 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3293 4 : AMDGPU::EXEC)
3294 4 : .addReg(CountReg)
3295 : .addImm(0);
3296 12 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3297 4 : .addReg(CountReg, RegState::Kill)
3298 : .addImm(64);
3299 4 : BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3300 4 : AMDGPU::EXEC)
3301 : .addImm(-1);
3302 4 : MI.eraseFromParent();
3303 4 : return BB;
3304 : }
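
A scalar model of the sequence sketched in the comment above (illustrative only): s_bfm_b64 builds a mask of 'count' low bits, and the compare/cmov pair patches the count == 64 case, which the 6-bit BFM width operand cannot express.

  #include <cstdint>
  static uint64_t execFromThreadCount(unsigned Count) {
    unsigned W = Count & 63;                     // the BFM width operand is 6 bits
    uint64_t Exec = W ? ((1ull << W) - 1) : 0;   // s_bfm_b64 exec, count, 0
    if (Count == 64)                             // s_cmp_eq_u32 count, 64
      Exec = ~0ull;                              // s_cmov_b64 exec, -1
    return Exec;
  }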
3305 :
3306 : case AMDGPU::GET_GROUPSTATICSIZE: {
3307 : DebugLoc DL = MI.getDebugLoc();
3308 122 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3309 61 : .add(MI.getOperand(0))
3310 61 : .addImm(MFI->getLDSSize());
3311 61 : MI.eraseFromParent();
3312 : return BB;
3313 : }
3314 71 : case AMDGPU::SI_INDIRECT_SRC_V1:
3315 : case AMDGPU::SI_INDIRECT_SRC_V2:
3316 : case AMDGPU::SI_INDIRECT_SRC_V4:
3317 : case AMDGPU::SI_INDIRECT_SRC_V8:
3318 : case AMDGPU::SI_INDIRECT_SRC_V16:
3319 71 : return emitIndirectSrc(MI, *BB, *getSubtarget());
3320 90 : case AMDGPU::SI_INDIRECT_DST_V1:
3321 : case AMDGPU::SI_INDIRECT_DST_V2:
3322 : case AMDGPU::SI_INDIRECT_DST_V4:
3323 : case AMDGPU::SI_INDIRECT_DST_V8:
3324 : case AMDGPU::SI_INDIRECT_DST_V16:
3325 90 : return emitIndirectDst(MI, *BB, *getSubtarget());
3326 84 : case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3327 : case AMDGPU::SI_KILL_I1_PSEUDO:
3328 84 : return splitKillBlock(MI, BB);
3329 49 : case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3330 49 : MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3331 :
3332 49 : unsigned Dst = MI.getOperand(0).getReg();
3333 49 : unsigned Src0 = MI.getOperand(1).getReg();
3334 49 : unsigned Src1 = MI.getOperand(2).getReg();
3335 : const DebugLoc &DL = MI.getDebugLoc();
3336 49 : unsigned SrcCond = MI.getOperand(3).getReg();
3337 :
3338 49 : unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3339 49 : unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3340 49 : unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3341 :
3342 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3343 49 : .addReg(SrcCond);
3344 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3345 49 : .addReg(Src0, 0, AMDGPU::sub0)
3346 49 : .addReg(Src1, 0, AMDGPU::sub0)
3347 49 : .addReg(SrcCondCopy);
3348 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3349 49 : .addReg(Src0, 0, AMDGPU::sub1)
3350 49 : .addReg(Src1, 0, AMDGPU::sub1)
3351 49 : .addReg(SrcCondCopy);
3352 :
3353 98 : BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3354 49 : .addReg(DstLo)
3355 : .addImm(AMDGPU::sub0)
3356 49 : .addReg(DstHi)
3357 : .addImm(AMDGPU::sub1);
3358 49 : MI.eraseFromParent();
3359 49 : return BB;
3360 : }
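
A single-lane scalar view of the expansion above (a sketch only; on the real hardware the condition is a 64-bit lane mask applied per lane): the 64-bit select is done as two 32-bit v_cndmask selects on the sub0/sub1 halves and then reassembled with REG_SEQUENCE.

  #include <cstdint>
  static uint64_t cndmask64(uint64_t Src0, uint64_t Src1, bool Cond) {
    uint32_t Lo = Cond ? uint32_t(Src1) : uint32_t(Src0);              // v_cndmask_b32 on sub0
    uint32_t Hi = Cond ? uint32_t(Src1 >> 32) : uint32_t(Src0 >> 32);  // v_cndmask_b32 on sub1
    return (uint64_t(Hi) << 32) | Lo;                                  // REG_SEQUENCE
  }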
3361 78 : case AMDGPU::SI_BR_UNDEF: {
3362 78 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3363 : const DebugLoc &DL = MI.getDebugLoc();
3364 156 : MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3365 78 : .add(MI.getOperand(0));
3366 78 : Br->getOperand(1).setIsUndef(true); // read undef SCC
3367 78 : MI.eraseFromParent();
3368 78 : return BB;
3369 : }
3370 1066 : case AMDGPU::ADJCALLSTACKUP:
3371 : case AMDGPU::ADJCALLSTACKDOWN: {
3372 1066 : const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3373 : MachineInstrBuilder MIB(*MF, &MI);
3374 :
3375 : // Add an implicit use of the frame offset reg to prevent the restore copy
3376 : // inserted after the call from being reorderd after stack operations in the
3377 :     // inserted after the call from being reordered after stack operations in
3378 :     // the caller's frame.
3379 1066 : .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3380 1066 : .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3381 : return BB;
3382 : }
3383 575 : case AMDGPU::SI_CALL_ISEL:
3384 : case AMDGPU::SI_TCRETURN_ISEL: {
3385 575 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3386 : const DebugLoc &DL = MI.getDebugLoc();
3387 575 : unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3388 :
3389 575 : MachineRegisterInfo &MRI = MF->getRegInfo();
3390 575 : unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3391 575 : MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3392 : assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3393 :
3394 575 : const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3395 :
3396 : MachineInstrBuilder MIB;
3397 1150 : if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3398 1066 : MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3399 533 : .add(MI.getOperand(0))
3400 : .addGlobalAddress(G);
3401 : } else {
3402 84 : MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3403 42 : .add(MI.getOperand(0))
3404 : .addGlobalAddress(G);
3405 :
3406 : // There is an additional imm operand for tcreturn, but it should be in the
3407 : // right place already.
3408 : }
3409 :
3410 4741 : for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3411 4166 : MIB.add(MI.getOperand(I));
3412 :
3413 : MIB.cloneMemRefs(MI);
3414 575 : MI.eraseFromParent();
3415 : return BB;
3416 : }
3417 0 : default:
3418 0 : return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3419 : }
3420 : }
3421 :
3422 29044 : bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3423 29044 : return isTypeLegal(VT.getScalarType());
3424 : }
3425 :
3426 4434 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3427 : // This currently forces unfolding various combinations of fsub into fma with
3428 : // free fneg'd operands. As long as we have fast FMA (controlled by
3429 : // isFMAFasterThanFMulAndFAdd), we should perform these.
3430 :
3431 : // When fma is quarter rate, for f64 where add / sub are at best half rate,
3432 : // most of these combines appear to be cycle neutral but save on instruction
3433 : // count / code size.
3434 4434 : return true;
3435 : }
3436 :
3437 14419 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3438 : EVT VT) const {
3439 14419 : if (!VT.isVector()) {
3440 14302 : return MVT::i1;
3441 : }
3442 117 : return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3443 : }
3444 :
3445 147959 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3446 : // TODO: Should i16 be used always if legal? For now it would force VALU
3447 : // shifts.
3448 147959 : return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3449 : }
3450 :
3451 : // Answering this is somewhat tricky and depends on the specific device, since
3452 : // devices have different rates for fma and for all f64 operations.
3453 : //
3454 : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3455 : // regardless of which device (although the number of cycles differs between
3456 : // devices), so it is always profitable for f64.
3457 : //
3458 : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3459 : // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3460 : // which we can always do even without fused FP ops since it returns the same
3461 : // result as the separate operations and since it is always full
3462 : // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3463 : // however does not support denormals, so we do report fma as faster if we have
3464 : // a fast fma device and require denormals.
3465 : //
3466 12661 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3467 12661 : VT = VT.getScalarType();
3468 :
3469 12661 : switch (VT.getSimpleVT().SimpleTy) {
3470 9778 : case MVT::f32: {
3471 :     // This is as fast on some subtargets. However, we always have full-rate f32
3472 :     // mad available, which returns the same result as the separate operations and
3473 :     // which we should prefer over fma. We can't use mad if we want to support
3474 :     // denormals, so only report fma as faster in that case.
3475 9778 : if (Subtarget->hasFP32Denormals())
3476 628 : return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3477 :
3478 : // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3479 9150 : return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3480 : }
3481 : case MVT::f64:
3482 : return true;
3483 1842 : case MVT::f16:
3484 1842 : return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3485 : default:
3486 : break;
3487 : }
3488 :
3489 0 : return false;
3490 : }
3491 :
3492 : //===----------------------------------------------------------------------===//
3493 : // Custom DAG Lowering Operations
3494 : //===----------------------------------------------------------------------===//
3495 :
3496 : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3497 : // wider vector type is legal.
3498 13 : SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3499 : SelectionDAG &DAG) const {
3500 : unsigned Opc = Op.getOpcode();
3501 13 : EVT VT = Op.getValueType();
3502 : assert(VT == MVT::v4f16);
3503 :
3504 : SDValue Lo, Hi;
3505 13 : std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3506 :
3507 : SDLoc SL(Op);
3508 : SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3509 26 : Op->getFlags());
3510 : SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3511 26 : Op->getFlags());
3512 :
3513 26 : return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3514 : }
3515 :
3516 : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3517 : // wider vector type is legal.
3518 125 : SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3519 : SelectionDAG &DAG) const {
3520 : unsigned Opc = Op.getOpcode();
3521 125 : EVT VT = Op.getValueType();
3522 : assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3523 :
3524 : SDValue Lo0, Hi0;
3525 125 : std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3526 : SDValue Lo1, Hi1;
3527 125 : std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3528 :
3529 : SDLoc SL(Op);
3530 :
3531 : SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3532 250 : Op->getFlags());
3533 : SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3534 250 : Op->getFlags());
3535 :
3536 250 : return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3537 : }
3538 :
3539 219926 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3540 439852 : switch (Op.getOpcode()) {
3541 24175 : default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3542 1753 : case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3543 74303 : case ISD::LOAD: {
3544 74303 : SDValue Result = LowerLOAD(Op, DAG);
3545 : assert((!Result.getNode() ||
3546 : Result.getNode()->getNumValues() == 2) &&
3547 : "Load should return a value and a chain");
3548 74303 : return Result;
3549 : }
3550 :
3551 98 : case ISD::FSIN:
3552 : case ISD::FCOS:
3553 98 : return LowerTrig(Op, DAG);
3554 734 : case ISD::SELECT: return LowerSELECT(Op, DAG);
3555 251 : case ISD::FDIV: return LowerFDIV(Op, DAG);
3556 263 : case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3557 82361 : case ISD::STORE: return LowerSTORE(Op, DAG);
3558 1026 : case ISD::GlobalAddress: {
3559 1026 : MachineFunction &MF = DAG.getMachineFunction();
3560 1026 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3561 1026 : return LowerGlobalAddress(MFI, Op, DAG);
3562 : }
3563 21890 : case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3564 1617 : case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3565 2731 : case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3566 46 : case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3567 162 : case ISD::INSERT_VECTOR_ELT:
3568 162 : return lowerINSERT_VECTOR_ELT(Op, DAG);
3569 6568 : case ISD::EXTRACT_VECTOR_ELT:
3570 6568 : return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3571 1255 : case ISD::BUILD_VECTOR:
3572 1255 : return lowerBUILD_VECTOR(Op, DAG);
3573 519 : case ISD::FP_ROUND:
3574 519 : return lowerFP_ROUND(Op, DAG);
3575 27 : case ISD::TRAP:
3576 27 : return lowerTRAP(Op, DAG);
3577 9 : case ISD::DEBUGTRAP:
3578 9 : return lowerDEBUGTRAP(Op, DAG);
3579 13 : case ISD::FABS:
3580 : case ISD::FNEG:
3581 : case ISD::FCANONICALIZE:
3582 13 : return splitUnaryVectorOp(Op, DAG);
3583 125 : case ISD::SHL:
3584 : case ISD::SRA:
3585 : case ISD::SRL:
3586 : case ISD::ADD:
3587 : case ISD::SUB:
3588 : case ISD::MUL:
3589 : case ISD::SMIN:
3590 : case ISD::SMAX:
3591 : case ISD::UMIN:
3592 : case ISD::UMAX:
3593 : case ISD::FMINNUM:
3594 : case ISD::FMAXNUM:
3595 : case ISD::FADD:
3596 : case ISD::FMUL:
3597 125 : return splitBinaryVectorOp(Op, DAG);
3598 : }
3599 : return SDValue();
3600 : }
3601 :
3602 45 : static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3603 : const SDLoc &DL,
3604 : SelectionDAG &DAG, bool Unpacked) {
3605 45 : if (!LoadVT.isVector())
3606 12 : return Result;
3607 :
3608 33 : if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3609 : // Truncate to v2i16/v4i16.
3610 19 : EVT IntLoadVT = LoadVT.changeTypeToInteger();
3611 :
3612 :     // Work around the legalizer not scalarizing the truncate after vector op
3613 :     // legalization by not creating an intermediate vector trunc.
3614 : SmallVector<SDValue, 4> Elts;
3615 19 : DAG.ExtractVectorElements(Result, Elts);
3616 77 : for (SDValue &Elt : Elts)
3617 58 : Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3618 :
3619 19 : Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3620 :
3621 : // Bitcast to original type (v2f16/v4f16).
3622 19 : return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3623 : }
3624 :
3625 : // Cast back to the original packed type.
3626 14 : return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3627 : }
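
A scalar picture of the unpacked-D16 case handled above (illustrative only): with unpacked D16 memory instructions each f16 element comes back in the low 16 bits of its own 32-bit lane, so the result is truncated lane by lane and rebuilt as a packed v2f16/v4f16 value. The two-element case is shown.

  #include <array>
  #include <cstdint>
  static std::array<uint16_t, 2>
  repackUnpackedD16(const std::array<uint32_t, 2> &Lanes) {
    // TRUNCATE each 32-bit lane to i16, then BUILD_VECTOR and BITCAST back to v2f16.
    return {uint16_t(Lanes[0]), uint16_t(Lanes[1])};
  }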
3628 :
3629 54 : SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3630 : MemSDNode *M,
3631 : SelectionDAG &DAG,
3632 : ArrayRef<SDValue> Ops,
3633 : bool IsIntrinsic) const {
3634 : SDLoc DL(M);
3635 :
3636 54 : bool Unpacked = Subtarget->hasUnpackedD16VMem();
3637 54 : EVT LoadVT = M->getValueType(0);
3638 :
3639 54 : EVT EquivLoadVT = LoadVT;
3640 72 : if (Unpacked && LoadVT.isVector()) {
3641 : EquivLoadVT = LoadVT.isVector() ?
3642 12 : EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3643 12 : LoadVT.getVectorNumElements()) : LoadVT;
3644 : }
3645 :
3646 : // Change from v4f16/v2f16 to EquivLoadVT.
3647 54 : SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3648 :
3649 : SDValue Load
3650 : = DAG.getMemIntrinsicNode(
3651 : IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3652 : VTList, Ops, M->getMemoryVT(),
3653 108 : M->getMemOperand());
3654 54 : if (!Unpacked) // Just adjusted the opcode.
3655 36 : return Load;
3656 :
3657 18 : SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3658 :
3659 36 : return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3660 : }
3661 :
3662 72 : static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3663 : SDNode *N, SelectionDAG &DAG) {
3664 72 : EVT VT = N->getValueType(0);
3665 72 : const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3666 : if (!CD)
3667 8 : return DAG.getUNDEF(VT);
3668 :
3669 64 : int CondCode = CD->getSExtValue();
3670 64 : if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3671 : CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3672 4 : return DAG.getUNDEF(VT);
3673 :
3674 : ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3675 :
3676 :
3677 60 : SDValue LHS = N->getOperand(1);
3678 60 : SDValue RHS = N->getOperand(2);
3679 :
3680 : SDLoc DL(N);
3681 :
3682 : EVT CmpVT = LHS.getValueType();
3683 : if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3684 10 : unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3685 : ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3686 10 : LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3687 10 : RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3688 : }
3689 :
3690 60 : ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3691 :
3692 : return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3693 60 : DAG.getCondCode(CCOpcode));
3694 : }
3695 :
3696 86 : static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3697 : SDNode *N, SelectionDAG &DAG) {
3698 86 : EVT VT = N->getValueType(0);
3699 86 : const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3700 : if (!CD)
3701 2 : return DAG.getUNDEF(VT);
3702 :
3703 84 : int CondCode = CD->getSExtValue();
3704 84 : if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3705 : CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3706 4 : return DAG.getUNDEF(VT);
3707 : }
3708 :
3709 80 : SDValue Src0 = N->getOperand(1);
3710 80 : SDValue Src1 = N->getOperand(2);
3711 : EVT CmpVT = Src0.getValueType();
3712 : SDLoc SL(N);
3713 :
3714 : if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3715 14 : Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3716 14 : Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3717 : }
3718 :
3719 : FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3720 80 : ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3721 : return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3722 80 : Src1, DAG.getCondCode(CCOpcode));
3723 : }
3724 :
3725 556 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
3726 : SmallVectorImpl<SDValue> &Results,
3727 : SelectionDAG &DAG) const {
3728 1112 : switch (N->getOpcode()) {
3729 : case ISD::INSERT_VECTOR_ELT: {
3730 67 : if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3731 23 : Results.push_back(Res);
3732 67 : return;
3733 : }
3734 : case ISD::EXTRACT_VECTOR_ELT: {
3735 0 : if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3736 0 : Results.push_back(Res);
3737 0 : return;
3738 : }
3739 85 : case ISD::INTRINSIC_WO_CHAIN: {
3740 255 : unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3741 : switch (IID) {
3742 29 : case Intrinsic::amdgcn_cvt_pkrtz: {
3743 29 : SDValue Src0 = N->getOperand(1);
3744 29 : SDValue Src1 = N->getOperand(2);
3745 : SDLoc SL(N);
3746 : SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3747 29 : Src0, Src1);
3748 58 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3749 : return;
3750 : }
3751 56 : case Intrinsic::amdgcn_cvt_pknorm_i16:
3752 : case Intrinsic::amdgcn_cvt_pknorm_u16:
3753 : case Intrinsic::amdgcn_cvt_pk_i16:
3754 : case Intrinsic::amdgcn_cvt_pk_u16: {
3755 56 : SDValue Src0 = N->getOperand(1);
3756 56 : SDValue Src1 = N->getOperand(2);
3757 : SDLoc SL(N);
3758 : unsigned Opcode;
3759 :
3760 56 : if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3761 : Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3762 38 : else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3763 : Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3764 20 : else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3765 : Opcode = AMDGPUISD::CVT_PK_I16_I32;
3766 : else
3767 : Opcode = AMDGPUISD::CVT_PK_U16_U32;
3768 :
3769 112 : EVT VT = N->getValueType(0);
3770 : if (isTypeLegal(VT))
3771 0 : Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3772 : else {
3773 56 : SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3774 112 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3775 : }
3776 : return;
3777 : }
3778 : }
3779 : break;
3780 : }
3781 : case ISD::INTRINSIC_W_CHAIN: {
3782 0 : if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3783 0 : Results.push_back(Res);
3784 0 : Results.push_back(Res.getValue(1));
3785 0 : return;
3786 : }
3787 :
3788 0 : break;
3789 : }
3790 : case ISD::SELECT: {
3791 : SDLoc SL(N);
3792 26 : EVT VT = N->getValueType(0);
3793 26 : EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3794 52 : SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3795 52 : SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3796 :
3797 26 : EVT SelectVT = NewVT;
3798 26 : if (NewVT.bitsLT(MVT::i32)) {
3799 2 : LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3800 2 : RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3801 : SelectVT = MVT::i32;
3802 : }
3803 :
3804 : SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3805 52 : N->getOperand(0), LHS, RHS);
3806 :
3807 0 : if (NewVT != SelectVT)
3808 2 : NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3809 52 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3810 : return;
3811 : }
3812 : case ISD::FNEG: {
3813 6 : if (N->getValueType(0) != MVT::v2f16)
3814 : break;
3815 :
3816 : SDLoc SL(N);
3817 10 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3818 :
3819 : SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3820 : BC,
3821 5 : DAG.getConstant(0x80008000, SL, MVT::i32));
3822 10 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3823 : return;
3824 : }
3825 : case ISD::FABS: {
3826 8 : if (N->getValueType(0) != MVT::v2f16)
3827 : break;
3828 :
3829 : SDLoc SL(N);
3830 12 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3831 :
3832 : SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3833 : BC,
3834 6 : DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3835 12 : Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3836 : return;
3837 : }
3838 : default:
3839 : break;
3840 : }
3841 : }
3842 :
3843 : /// Helper function for LowerBRCOND
3844 0 : static SDNode *findUser(SDValue Value, unsigned Opcode) {
3845 :
3846 : SDNode *Parent = Value.getNode();
3847 853 : for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3848 1906 : I != E; ++I) {
3849 :
3850 0 : if (I.getUse().get() != Value)
3851 0 : continue;
3852 :
3853 850 : if (I->getOpcode() == Opcode)
3854 : return *I;
3855 : }
3856 : return nullptr;
3857 : }
3858 :
3859 1753 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3860 1753 : if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3861 1806 : switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3862 : case Intrinsic::amdgcn_if:
3863 : return AMDGPUISD::IF;
3864 52 : case Intrinsic::amdgcn_else:
3865 52 : return AMDGPUISD::ELSE;
3866 75 : case Intrinsic::amdgcn_loop:
3867 75 : return AMDGPUISD::LOOP;
3868 : case Intrinsic::amdgcn_end_cf:
3869 : llvm_unreachable("should not occur");
3870 2 : default:
3871 2 : return 0;
3872 : }
3873 : }
3874 :
3875 : // break, if_break, else_break are all only used as inputs to loop, not
3876 : // directly as branch conditions.
3877 : return 0;
3878 : }
3879 :
3880 4 : void SITargetLowering::createDebuggerPrologueStackObjects(
3881 : MachineFunction &MF) const {
3882 : // Create stack objects that are used for emitting debugger prologue.
3883 : //
3884 : // Debugger prologue writes work group IDs and work item IDs to scratch memory
3885 : // at fixed location in the following format:
3886 :   // at a fixed location in the following format:
3887 : // offset 4: work group ID y
3888 : // offset 8: work group ID z
3889 : // offset 16: work item ID x
3890 : // offset 20: work item ID y
3891 : // offset 24: work item ID z
3892 4 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3893 : int ObjectIdx = 0;
3894 :
3895 : // For each dimension:
3896 16 : for (unsigned i = 0; i < 3; ++i) {
3897 : // Create fixed stack object for work group ID.
3898 12 : ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3899 : Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3900 : // Create fixed stack object for work item ID.
3901 12 : ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3902 : Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3903 : }
3904 4 : }
3905 :
3906 1338 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3907 1338 : const Triple &TT = getTargetMachine().getTargetTriple();
3908 1235 : return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3909 1441 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3910 103 : AMDGPU::shouldEmitConstantsToTextSection(TT);
3911 : }
3912 :
3913 694 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3914 629 : return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3915 557 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3916 137 : GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3917 842 : !shouldEmitFixup(GV) &&
3918 76 : !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3919 : }
3920 :
3921 588 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3922 588 : return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3923 : }
3924 :
3925 : /// This transforms the control flow intrinsics to get the branch destination as
3926 : /// This transforms the control flow intrinsics to get the branch destination as
3927 : /// the last parameter, and also switches the branch target with BR if the need arises.
3928 : SelectionDAG &DAG) const {
3929 : SDLoc DL(BRCOND);
3930 :
3931 1753 : SDNode *Intr = BRCOND.getOperand(1).getNode();
3932 1753 : SDValue Target = BRCOND.getOperand(2);
3933 : SDNode *BR = nullptr;
3934 : SDNode *SetCC = nullptr;
3935 :
3936 1753 : if (Intr->getOpcode() == ISD::SETCC) {
3937 : // As long as we negate the condition everything is fine
3938 : SetCC = Intr;
3939 1425 : Intr = SetCC->getOperand(0).getNode();
3940 :
3941 : } else {
3942 : // Get the target from BR if we don't negate the condition
3943 : BR = findUser(BRCOND, ISD::BR);
3944 328 : Target = BR->getOperand(1);
3945 : }
3946 :
3947 : // FIXME: This changes the types of the intrinsics instead of introducing new
3948 : // nodes with the correct types.
3949 : // e.g. llvm.amdgcn.loop
3950 :
3951 : // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3952 : // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3953 :
3954 1753 : unsigned CFNode = isCFIntrinsic(Intr);
3955 1753 : if (CFNode == 0) {
3956 : // This is a uniform branch so we don't need to legalize.
3957 1153 : return BRCOND;
3958 : }
3959 :
3960 1200 : bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3961 : Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3962 :
3963 : assert(!SetCC ||
3964 : (SetCC->getConstantOperandVal(1) == 1 &&
3965 : cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3966 : ISD::SETNE));
3967 :
3968 : // operands of the new intrinsic call
3969 : SmallVector<SDValue, 4> Ops;
3970 600 : if (HaveChain)
3971 600 : Ops.push_back(BRCOND.getOperand(0));
3972 :
3973 1200 : Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
3974 600 : Ops.push_back(Target);
3975 :
3976 1200 : ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3977 :
3978 : // build the new intrinsic call
3979 600 : SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3980 :
3981 600 : if (!HaveChain) {
3982 : SDValue Ops[] = {
3983 : SDValue(Result, 0),
3984 : BRCOND.getOperand(0)
3985 0 : };
3986 :
3987 0 : Result = DAG.getMergeValues(Ops, DL).getNode();
3988 : }
3989 :
3990 600 : if (BR) {
3991 : // Give the branch instruction our target
3992 : SDValue Ops[] = {
3993 90 : BR->getOperand(0),
3994 : BRCOND.getOperand(2)
3995 180 : };
3996 180 : SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3997 90 : DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3998 : BR = NewBR.getNode();
3999 : }
4000 :
4001 600 : SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4002 :
4003 : // Copy the intrinsic results to registers
4004 1725 : for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4005 : SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4006 525 : if (!CopyToReg)
4007 : continue;
4008 :
4009 522 : Chain = DAG.getCopyToReg(
4010 : Chain, DL,
4011 522 : CopyToReg->getOperand(1),
4012 : SDValue(Result, i - 1),
4013 1044 : SDValue());
4014 :
4015 1044 : DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4016 : }
4017 :
4018 : // Remove the old intrinsic from the chain
4019 1200 : DAG.ReplaceAllUsesOfValueWith(
4020 600 : SDValue(Intr, Intr->getNumValues() - 1),
4021 600 : Intr->getOperand(0));
4022 :
4023 600 : return Chain;
4024 : }
4025 :
4026 2526 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4027 : SDValue Op,
4028 : const SDLoc &DL,
4029 : EVT VT) const {
4030 2526 : return Op.getValueType().bitsLE(VT) ?
4031 2526 : DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4032 2526 : DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4033 : }
4034 :
4035 519 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4036 : assert(Op.getValueType() == MVT::f16 &&
4037 : "Do not know how to custom lower FP_ROUND for non-f16 type");
4038 :
4039 519 : SDValue Src = Op.getOperand(0);
4040 : EVT SrcVT = Src.getValueType();
4041 : if (SrcVT != MVT::f64)
4042 509 : return Op;
4043 :
4044 : SDLoc DL(Op);
4045 :
4046 10 : SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4047 10 : SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4048 10 : return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4049 : }
4050 :
4051 27 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4052 : SDLoc SL(Op);
4053 27 : SDValue Chain = Op.getOperand(0);
4054 :
4055 27 : if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4056 12 : !Subtarget->isTrapHandlerEnabled())
4057 21 : return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4058 :
4059 6 : MachineFunction &MF = DAG.getMachineFunction();
4060 6 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4061 : unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4062 : assert(UserSGPR != AMDGPU::NoRegister);
4063 : SDValue QueuePtr = CreateLiveInRegister(
4064 12 : DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4065 6 : SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4066 : SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4067 6 : QueuePtr, SDValue());
4068 : SDValue Ops[] = {
4069 : ToReg,
4070 6 : DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4071 : SGPR01,
4072 6 : ToReg.getValue(1)
4073 6 : };
4074 6 : return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4075 : }
4076 :
4077 9 : SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4078 : SDLoc SL(Op);
4079 9 : SDValue Chain = Op.getOperand(0);
4080 9 : MachineFunction &MF = DAG.getMachineFunction();
4081 :
4082 9 : if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4083 4 : !Subtarget->isTrapHandlerEnabled()) {
4084 : DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4085 : "debugtrap handler not supported",
4086 : Op.getDebugLoc(),
4087 7 : DS_Warning);
4088 7 : LLVMContext &Ctx = MF.getFunction().getContext();
4089 7 : Ctx.diagnose(NoTrap);
4090 7 : return Chain;
4091 : }
4092 :
4093 : SDValue Ops[] = {
4094 : Chain,
4095 2 : DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4096 2 : };
4097 2 : return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4098 : }
4099 :
4100 32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4101 : SelectionDAG &DAG) const {
4102 : // FIXME: Use inline constants (src_{shared, private}_base) instead.
4103 32 : if (Subtarget->hasApertureRegs()) {
4104 12 : unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4105 : AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4106 : AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4107 : unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4108 : AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4109 : AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4110 12 : unsigned Encoding =
4111 : AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4112 12 : Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4113 : WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4114 :
4115 12 : SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4116 : SDValue ApertureReg = SDValue(
4117 12 : DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4118 12 : SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4119 12 : return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4120 : }
4121 :
4122 20 : MachineFunction &MF = DAG.getMachineFunction();
4123 20 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4124 : unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4125 : assert(UserSGPR != AMDGPU::NoRegister);
4126 :
4127 : SDValue QueuePtr = CreateLiveInRegister(
4128 40 : DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4129 :
4130 : // Offset into amd_queue_t for group_segment_aperture_base_hi /
4131 : // private_segment_aperture_base_hi.
4132 20 : uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4133 :
4134 20 : SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4135 :
4136 : // TODO: Use custom target PseudoSourceValue.
4137 : // TODO: We should use the value from the IR intrinsic call, but it might not
4138 :   // be available, and it is unclear how we would get it here.
4139 20 : Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4140 : AMDGPUAS::CONSTANT_ADDRESS));
4141 :
4142 : MachinePointerInfo PtrInfo(V, StructOffset);
4143 : return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4144 : MinAlign(64, StructOffset),
4145 : MachineMemOperand::MODereferenceable |
4146 40 : MachineMemOperand::MOInvariant);
4147 : }
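
The fallback path above can be pictured with the following layout sketch (an assumption based on the amd_queue_t field names mentioned in the comment; the struct is not part of LLVM): the loaded 32-bit "base hi" word later becomes the high half of the 64-bit flat pointer built in lowerADDRSPACECAST.

  #include <cstdint>
  struct QueueAperturesView {              // subset of amd_queue_t, offsets per the code above
    char Pad[0x40];
    uint32_t GroupSegmentApertureBaseHi;   // offset 0x40, used for LOCAL_ADDRESS
    uint32_t PrivateSegmentApertureBaseHi; // offset 0x44, used for PRIVATE_ADDRESS
  };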
4148 :
4149 46 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4150 : SelectionDAG &DAG) const {
4151 : SDLoc SL(Op);
4152 : const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4153 :
4154 46 : SDValue Src = ASC->getOperand(0);
4155 46 : SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4156 :
4157 : const AMDGPUTargetMachine &TM =
4158 : static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4159 :
4160 : // flat -> local/private
4161 46 : if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4162 12 : unsigned DestAS = ASC->getDestAddressSpace();
4163 :
4164 24 : if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4165 12 : DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4166 : unsigned NullVal = TM.getNullPointerValue(DestAS);
4167 12 : SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4168 12 : SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4169 12 : SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4170 :
4171 : return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4172 12 : NonNull, Ptr, SegmentNullPtr);
4173 : }
4174 : }
4175 :
4176 : // local/private -> flat
4177 34 : if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4178 : unsigned SrcAS = ASC->getSrcAddressSpace();
4179 :
4180 66 : if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4181 33 : SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4182 : unsigned NullVal = TM.getNullPointerValue(SrcAS);
4183 32 : SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4184 :
4185 : SDValue NonNull
4186 32 : = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4187 :
4188 32 : SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4189 : SDValue CvtPtr
4190 32 : = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4191 :
4192 : return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4193 : DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4194 32 : FlatNullPtr);
4195 : }
4196 : }
4197 :
4198 : // global <-> flat are no-ops and never emitted.
4199 :
4200 2 : const MachineFunction &MF = DAG.getMachineFunction();
4201 : DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4202 2 : MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4203 2 : DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4204 :
4205 4 : return DAG.getUNDEF(ASC->getValueType(0));
4206 : }
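
A scalar sketch of the two cast directions handled above (illustrative only; SegmentNull stands for TM.getNullPointerValue() of the 32-bit address space and Aperture for the value produced by getSegmentAperture):

  #include <cstdint>
  static uint32_t flatToSegment(uint64_t Flat, uint32_t SegmentNull) {
    return Flat != 0 ? uint32_t(Flat)   // truncate the non-null pointer
                     : SegmentNull;     // map the flat null pointer to the segment null
  }
  static uint64_t segmentToFlat(uint32_t Seg, uint32_t SegmentNull, uint32_t Aperture) {
    return Seg != SegmentNull ? ((uint64_t(Aperture) << 32) | Seg)  // {ptr, aperture} bitcast to i64
                              : 0;                                  // flat null pointer
  }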
4207 :
4208 229 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4209 : SelectionDAG &DAG) const {
4210 229 : SDValue Vec = Op.getOperand(0);
4211 229 : SDValue InsVal = Op.getOperand(1);
4212 229 : SDValue Idx = Op.getOperand(2);
4213 229 : EVT VecVT = Vec.getValueType();
4214 229 : EVT EltVT = VecVT.getVectorElementType();
4215 229 : unsigned VecSize = VecVT.getSizeInBits();
4216 229 : unsigned EltSize = EltVT.getSizeInBits();
4217 :
4218 :
4219 : assert(VecSize <= 64);
4220 :
4221 : unsigned NumElts = VecVT.getVectorNumElements();
4222 : SDLoc SL(Op);
4223 : auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4224 :
4225 229 : if (NumElts == 4 && EltSize == 16 && KIdx) {
4226 20 : SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4227 :
4228 : SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4229 20 : DAG.getConstant(0, SL, MVT::i32));
4230 : SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4231 20 : DAG.getConstant(1, SL, MVT::i32));
4232 :
4233 20 : SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4234 20 : SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4235 :
4236 40 : unsigned Idx = KIdx->getZExtValue();
4237 : bool InsertLo = Idx < 2;
4238 : SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4239 20 : InsertLo ? LoVec : HiVec,
4240 : DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4241 47 : DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4242 :
4243 20 : InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4244 :
4245 : SDValue Concat = InsertLo ?
4246 14 : DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4247 27 : DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4248 :
4249 20 : return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4250 : }
4251 :
4252 : if (isa<ConstantSDNode>(Idx))
4253 164 : return SDValue();
4254 :
4255 45 : MVT IntVT = MVT::getIntegerVT(VecSize);
4256 :
4257 : // Avoid stack access for dynamic indexing.
4258 45 : SDValue Val = InsVal;
4259 : if (InsVal.getValueType() == MVT::f16)
4260 21 : Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4261 :
4262 : // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4263 45 : SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4264 :
4265 : assert(isPowerOf2_32(EltSize));
4266 45 : SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4267 :
4268 : // Convert vector index to bit-index.
4269 45 : SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4270 :
4271 45 : SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4272 : SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4273 : DAG.getConstant(0xffff, SL, IntVT),
4274 45 : ScaledIdx);
4275 :
4276 45 : SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4277 : SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4278 45 : DAG.getNOT(SL, BFM, IntVT), BCVec);
4279 :
4280 45 : SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4281 45 : return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4282 : }
4283 :
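// Conceptual sketch (illustrative, not from the source) of the mask-based dynamic
// insert referenced by the v_bfi_b32/v_bfm_b32 comment above, written with plain
// integers for a vector of 16-bit elements that fits in 64 bits.
#include <cstdint>
static uint64_t insertElt16(uint64_t Vec, uint16_t Val, unsigned Idx) {
  unsigned BitIdx = Idx * 16;                 // vector index -> bit index
  uint64_t Mask = uint64_t(0xffff) << BitIdx; // v_bfm: ones over the element slot
  uint64_t Ins  = uint64_t(Val) << BitIdx;    // value moved into that slot
  return (Mask & Ins) | (~Mask & Vec);        // v_bfi: blend value into the vector
}
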
4284 6568 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4285 : SelectionDAG &DAG) const {
4286 : SDLoc SL(Op);
4287 :
4288 6568 : EVT ResultVT = Op.getValueType();
4289 6568 : SDValue Vec = Op.getOperand(0);
4290 6568 : SDValue Idx = Op.getOperand(1);
4291 6568 : EVT VecVT = Vec.getValueType();
4292 6568 : unsigned VecSize = VecVT.getSizeInBits();
4293 6568 : EVT EltVT = VecVT.getVectorElementType();
4294 : assert(VecSize <= 64);
4295 :
4296 : DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4297 :
4298 : // Make sure we do any optimizations that will make it easier to fold
4299 : // source modifiers before obscuring it with bit operations.
4300 :
4301 : // XXX - Why doesn't this get called when vector_shuffle is expanded?
4302 6568 : if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4303 7 : return Combined;
4304 :
4305 6561 : unsigned EltSize = EltVT.getSizeInBits();
4306 : assert(isPowerOf2_32(EltSize));
4307 :
4308 6561 : MVT IntVT = MVT::getIntegerVT(VecSize);
4309 6561 : SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4310 :
4311 : // Convert vector index to bit-index (* EltSize)
4312 6561 : SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4313 :
4314 6561 : SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4315 6561 : SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4316 :
4317 : if (ResultVT == MVT::f16) {
4318 1652 : SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4319 1652 : return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4320 : }
4321 :
4322 4909 : return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4323 : }
4324 :
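// Minimal sketch (illustrative only) of the shift-and-truncate extract above, for
// a <= 64-bit vector of 16-bit elements.
#include <cstdint>
static uint16_t extractElt16(uint64_t Vec, unsigned Idx) {
  unsigned BitIdx = Idx * 16;     // vector index -> bit index (SHL by log2(EltSize))
  return uint16_t(Vec >> BitIdx); // SRL then TRUNCATE
}
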
4325 1255 : SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4326 : SelectionDAG &DAG) const {
4327 : SDLoc SL(Op);
4328 1255 : EVT VT = Op.getValueType();
4329 :
4330 : if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4331 292 : EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4332 :
4333 : // Turn into pair of packed build_vectors.
4334 : // TODO: Special case for constants that can be materialized with s_mov_b64.
4335 : SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4336 584 : { Op.getOperand(0), Op.getOperand(1) });
4337 : SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4338 584 : { Op.getOperand(2), Op.getOperand(3) });
4339 :
4340 292 : SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4341 292 : SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4342 :
4343 584 : SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4344 292 : return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4345 : }
4346 :
4347 : assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4348 : assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4349 :
4350 963 : SDValue Lo = Op.getOperand(0);
4351 963 : SDValue Hi = Op.getOperand(1);
4352 :
4353 : // Avoid adding defined bits with the zero_extend.
4354 963 : if (Hi.isUndef()) {
4355 33 : Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4356 33 : SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4357 33 : return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4358 : }
4359 :
4360 930 : Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4361 930 : Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4362 :
4363 : SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4364 930 : DAG.getConstant(16, SL, MVT::i32));
4365 930 : if (Lo.isUndef())
4366 11 : return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4367 :
4368 919 : Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4369 919 : Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4370 :
4371 919 : SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4372 919 : return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4373 : }
4374 :
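// Sketch (illustrative only) of the v2i16/v2f16 packing done above when VOP3P
// instructions are not available: both halves are zero-extended and combined with
// a shift and an OR.
#include <cstdint>
static uint32_t packV2x16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16); // i32 view of the packed pair
}
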
4375 : bool
4376 1725 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4377 : // We can fold offsets for anything that doesn't require a GOT relocation.
4378 3413 : return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4379 3307 : GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4380 3450 : GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4381 106 : !shouldEmitGOTReloc(GA->getGlobal());
4382 : }
4383 :
4384 : static SDValue
4385 613 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4386 : const SDLoc &DL, unsigned Offset, EVT PtrVT,
4387 : unsigned GAFlags = SIInstrInfo::MO_NONE) {
4388 : // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4389 : // lowered to the following code sequence:
4390 : //
4391 : // For constant address space:
4392 : // s_getpc_b64 s[0:1]
4393 : // s_add_u32 s0, s0, $symbol
4394 : // s_addc_u32 s1, s1, 0
4395 : //
4396 : // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4397 : // a fixup or relocation is emitted to replace $symbol with a literal
4398 : // constant, which is a pc-relative offset from the encoding of the $symbol
4399 : // operand to the global variable.
4400 : //
4401 : // For global address space:
4402 : // s_getpc_b64 s[0:1]
4403 : // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4404 : // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4405 : //
4406 : // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4407 : // fixups or relocations are emitted to replace $symbol@*@lo and
4408 : // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4409 : // which is a 64-bit pc-relative offset from the encoding of the $symbol
4410 : // operand to the global variable.
4411 : //
4412 : // What we want here is an offset from the value returned by s_getpc
4413 : // (which is the address of the s_add_u32 instruction) to the global
4414 : // variable, but since the encoding of $symbol starts 4 bytes after the start
4415 : // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4416 : // small. This requires us to add 4 to the global variable offset in order to
4417 : // compute the correct address.
4418 613 : SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4419 613 : GAFlags);
4420 : SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4421 : GAFlags == SIInstrInfo::MO_NONE ?
4422 1226 : GAFlags : GAFlags + 1);
4423 613 : return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4424 : }
4425 :
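// Worked model (illustrative, assuming a REL32-style relocation that stores
// S + A - P) of the "+ 4" above: the literal patched by the fixup sits 4 bytes
// after the address returned by s_getpc_b64, so with addend A = Offset + 4 the
// add sequence reproduces GV + Offset. The helper and its parameters are
// hypothetical.
#include <cstdint>
static uint64_t resolvePcRel(uint64_t Pc, uint64_t GV, uint64_t Offset) {
  uint64_t Addend = Offset + 4;              // addend emitted on the symbol above
  uint64_t Literal = GV + Addend - (Pc + 4); // S + A - P, with P = Pc + 4
  return Pc + Literal;                       // s_add_u32/s_addc_u32 == GV + Offset
}
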
4426 1026 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4427 : SDValue Op,
4428 : SelectionDAG &DAG) const {
4429 : GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4430 1026 : const GlobalValue *GV = GSD->getGlobal();
4431 1639 : if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4432 1639 : GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4433 613 : GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4434 413 : return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4435 :
4436 : SDLoc DL(GSD);
4437 613 : EVT PtrVT = Op.getValueType();
4438 :
4439 : // FIXME: Should not make address space based decisions here.
4440 613 : if (shouldEmitFixup(GV))
4441 25 : return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4442 588 : else if (shouldEmitPCReloc(GV))
4443 563 : return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4444 563 : SIInstrInfo::MO_REL32);
4445 :
4446 : SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4447 25 : SIInstrInfo::MO_GOTPCREL32);
4448 :
4449 25 : Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4450 25 : PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4451 25 : const DataLayout &DataLayout = DAG.getDataLayout();
4452 25 : unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4453 : MachinePointerInfo PtrInfo
4454 25 : = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4455 :
4456 : return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4457 : MachineMemOperand::MODereferenceable |
4458 25 : MachineMemOperand::MOInvariant);
4459 : }
4460 :
4461 9146 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4462 : const SDLoc &DL, SDValue V) const {
4463 : // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4464 : // the destination register.
4465 : //
4466 : // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4467 : // so we will end up with redundant moves to m0.
4468 : //
4469 : // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4470 :
4471 : // A Null SDValue creates a glue result.
4472 9146 : SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4473 : V, Chain);
4474 9146 : return SDValue(M0, 0);
4475 : }
4476 :
4477 91 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4478 : SDValue Op,
4479 : MVT VT,
4480 : unsigned Offset) const {
4481 : SDLoc SL(Op);
4482 : SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4483 182 : DAG.getEntryNode(), Offset, 4, false);
4484 : // The local size values will have the hi 16-bits as zero.
4485 : return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4486 91 : DAG.getValueType(VT));
4487 : }
4488 :
4489 2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4490 : EVT VT) {
4491 2 : DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4492 : "non-hsa intrinsic with hsa target",
4493 2 : DL.getDebugLoc());
4494 2 : DAG.getContext()->diagnose(BadIntrin);
4495 2 : return DAG.getUNDEF(VT);
4496 : }
4497 :
4498 5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4499 : EVT VT) {
4500 5 : DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4501 : "intrinsic not supported on subtarget",
4502 5 : DL.getDebugLoc());
4503 5 : DAG.getContext()->diagnose(BadIntrin);
4504 5 : return DAG.getUNDEF(VT);
4505 : }
4506 :
4507 737 : static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4508 : ArrayRef<SDValue> Elts) {
4509 : assert(!Elts.empty());
4510 : MVT Type;
4511 : unsigned NumElts;
4512 :
4513 737 : if (Elts.size() == 1) {
4514 : Type = MVT::f32;
4515 : NumElts = 1;
4516 509 : } else if (Elts.size() == 2) {
4517 : Type = MVT::v2f32;
4518 : NumElts = 2;
4519 349 : } else if (Elts.size() <= 4) {
4520 : Type = MVT::v4f32;
4521 : NumElts = 4;
4522 101 : } else if (Elts.size() <= 8) {
4523 : Type = MVT::v8f32;
4524 : NumElts = 8;
4525 : } else {
4526 : assert(Elts.size() <= 16);
4527 : Type = MVT::v16f32;
4528 : NumElts = 16;
4529 : }
4530 :
4531 737 : SmallVector<SDValue, 16> VecElts(NumElts);
4532 2760 : for (unsigned i = 0; i < Elts.size(); ++i) {
4533 4046 : SDValue Elt = Elts[i];
4534 : if (Elt.getValueType() != MVT::f32)
4535 714 : Elt = DAG.getBitcast(MVT::f32, Elt);
4536 2023 : VecElts[i] = Elt;
4537 : }
4538 1126 : for (unsigned i = Elts.size(); i < NumElts; ++i)
4539 389 : VecElts[i] = DAG.getUNDEF(MVT::f32);
4540 :
4541 737 : if (NumElts == 1)
4542 228 : return VecElts[0];
4543 509 : return DAG.getBuildVector(Type, DL, VecElts);
4544 : }
4545 :
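// Small sketch (illustrative only) of the size bucketing above: the operand count
// is rounded up to the next supported vector size (1, 2, 4, 8 or 16 dwords) and
// the tail is padded with undef.
static unsigned roundToAddrVectorSize(unsigned NumElts) {
  if (NumElts <= 2)
    return NumElts; // f32 or v2f32
  if (NumElts <= 4)
    return 4;       // v4f32
  if (NumElts <= 8)
    return 8;       // v8f32
  return 16;        // v16f32 (NumElts <= 16 asserted above)
}
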
4546 0 : static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4547 : SDValue *GLC, SDValue *SLC) {
4548 : auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4549 : if (!CachePolicyConst)
4550 0 : return false;
4551 :
4552 0 : uint64_t Value = CachePolicyConst->getZExtValue();
4553 0 : SDLoc DL(CachePolicy);
4554 0 : if (GLC) {
4555 0 : *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4556 0 : Value &= ~(uint64_t)0x1;
4557 : }
4558 0 : if (SLC) {
4559 0 : *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4560 0 : Value &= ~(uint64_t)0x2;
4561 : }
4562 :
4563 0 : return Value == 0;
4564 : }
4565 :
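// Sketch (illustrative only) of the cachepolicy decoding above: bit 0 is glc,
// bit 1 is slc, and any leftover bit makes the policy invalid.
#include <cstdint>
static bool decodeCachePolicy(uint64_t Value, bool *GLC, bool *SLC) {
  if (GLC) {
    *GLC = Value & 0x1;
    Value &= ~uint64_t(0x1);
  }
  if (SLC) {
    *SLC = Value & 0x2;
    Value &= ~uint64_t(0x2);
  }
  return Value == 0; // leftover bits -> reject, mirroring parseCachePolicy
}
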
4566 744 : SDValue SITargetLowering::lowerImage(SDValue Op,
4567 : const AMDGPU::ImageDimIntrinsicInfo *Intr,
4568 : SelectionDAG &DAG) const {
4569 : SDLoc DL(Op);
4570 744 : MachineFunction &MF = DAG.getMachineFunction();
4571 744 : const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4572 : const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4573 744 : AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4574 744 : const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4575 : const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4576 744 : AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4577 : unsigned IntrOpcode = Intr->BaseOpcode;
4578 :
4579 744 : SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
4580 : bool IsD16 = false;
4581 : bool IsA16 = false;
4582 744 : SDValue VData;
4583 : int NumVDataDwords;
4584 : unsigned AddrIdx; // Index of first address argument
4585 : unsigned DMask;
4586 :
4587 744 : if (BaseOpcode->Atomic) {
4588 42 : VData = Op.getOperand(2);
4589 :
4590 42 : bool Is64Bit = VData.getValueType() == MVT::i64;
4591 42 : if (BaseOpcode->AtomicX2) {
4592 2 : SDValue VData2 = Op.getOperand(3);
4593 2 : VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4594 6 : {VData, VData2});
4595 2 : if (Is64Bit)
4596 0 : VData = DAG.getBitcast(MVT::v4i32, VData);
4597 :
4598 2 : ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4599 2 : DMask = Is64Bit ? 0xf : 0x3;
4600 2 : NumVDataDwords = Is64Bit ? 4 : 2;
4601 : AddrIdx = 4;
4602 : } else {
4603 40 : DMask = Is64Bit ? 0x3 : 0x1;
4604 40 : NumVDataDwords = Is64Bit ? 2 : 1;
4605 : AddrIdx = 3;
4606 : }
4607 : } else {
4608 : unsigned DMaskIdx;
4609 :
4610 702 : if (BaseOpcode->Store) {
4611 87 : VData = Op.getOperand(2);
4612 :
4613 87 : MVT StoreVT = VData.getSimpleValueType();
4614 87 : if (StoreVT.getScalarType() == MVT::f16) {
4615 12 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4616 12 : !BaseOpcode->HasD16)
4617 0 : return Op; // D16 is unsupported for this instruction
4618 :
4619 : IsD16 = true;
4620 12 : VData = handleD16VData(VData, DAG);
4621 : }
4622 :
4623 174 : NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4624 : DMaskIdx = 3;
4625 : } else {
4626 615 : MVT LoadVT = Op.getSimpleValueType();
4627 615 : if (LoadVT.getScalarType() == MVT::f16) {
4628 27 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4629 27 : !BaseOpcode->HasD16)
4630 0 : return Op; // D16 is unsupported for this instruction
4631 :
4632 : IsD16 = true;
4633 54 : if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
4634 11 : ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
4635 : }
4636 :
4637 615 : NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
4638 615 : DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
4639 : }
4640 :
4641 : auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4642 : if (!DMaskConst)
4643 0 : return Op;
4644 :
4645 702 : AddrIdx = DMaskIdx + 1;
4646 702 : DMask = DMaskConst->getZExtValue();
4647 702 : if (!DMask && !BaseOpcode->Store) {
4648 : // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
4649 : // store the channels' default values.
4650 7 : SDValue Undef = DAG.getUNDEF(Op.getValueType());
4651 7 : if (isa<MemSDNode>(Op))
4652 6 : return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4653 5 : return Undef;
4654 : }
4655 : }
4656 :
4657 737 : unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4658 737 : unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4659 737 : unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4660 737 : unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4661 : NumCoords + NumLCM;
4662 : unsigned NumMIVAddrs = NumVAddrs;
4663 :
4664 : SmallVector<SDValue, 4> VAddrs;
4665 :
4666 : // Optimize _L to _LZ when _L is zero or negative
4667 737 : if (LZMappingInfo) {
4668 : if (auto ConstantLod =
4669 60 : dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4670 60 : if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4671 30 : IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
4672 30 : NumMIVAddrs--; // remove 'lod'
4673 : }
4674 : }
4675 : }
4676 :
4677 : // Check for 16 bit addresses and pack if true.
4678 737 : unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4679 737 : MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4680 737 : if (VAddrVT.getScalarType() == MVT::f16 &&
4681 : ST->hasFeature(AMDGPU::FeatureR128A16)) {
4682 : IsA16 = true;
4683 228 : for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4684 167 : SDValue AddrLo, AddrHi;
4685 : // Push back extra arguments.
4686 167 : if (i < DimIdx) {
4687 42 : AddrLo = Op.getOperand(i);
4688 : } else {
4689 125 : AddrLo = Op.getOperand(i);
4690 : // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4691 : // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4692 125 : if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4693 91 : ((NumGradients / 2) % 2 == 1 &&
4694 25 : (i == DimIdx + (NumGradients / 2) - 1 ||
4695 16 : i == DimIdx + NumGradients - 1))) {
4696 52 : AddrHi = DAG.getUNDEF(MVT::f16);
4697 : } else {
4698 73 : AddrHi = Op.getOperand(i + 1);
4699 : i++;
4700 : }
4701 125 : AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
4702 250 : {AddrLo, AddrHi});
4703 125 : AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4704 : }
4705 167 : VAddrs.push_back(AddrLo);
4706 : }
4707 : } else {
4708 2532 : for (unsigned i = 0; i < NumMIVAddrs; ++i)
4709 3712 : VAddrs.push_back(Op.getOperand(AddrIdx + i));
4710 : }
4711 :
4712 1474 : SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4713 :
4714 737 : SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4715 737 : SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4716 : unsigned CtrlIdx; // Index of texfailctrl argument
4717 737 : SDValue Unorm;
4718 737 : if (!BaseOpcode->Sampler) {
4719 239 : Unorm = True;
4720 239 : CtrlIdx = AddrIdx + NumVAddrs + 1;
4721 : } else {
4722 : auto UnormConst =
4723 498 : dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4724 : if (!UnormConst)
4725 0 : return Op;
4726 :
4727 996 : Unorm = UnormConst->getZExtValue() ? True : False;
4728 498 : CtrlIdx = AddrIdx + NumVAddrs + 3;
4729 : }
4730 :
4731 737 : SDValue TexFail = Op.getOperand(CtrlIdx);
4732 : auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
4733 1474 : if (!TexFailConst || TexFailConst->getZExtValue() != 0)
4734 0 : return Op;
4735 :
4736 737 : SDValue GLC;
4737 737 : SDValue SLC;
4738 737 : if (BaseOpcode->Atomic) {
4739 42 : GLC = True; // TODO no-return optimization
4740 84 : if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4741 0 : return Op;
4742 : } else {
4743 1390 : if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4744 0 : return Op;
4745 : }
4746 :
4747 : SmallVector<SDValue, 14> Ops;
4748 737 : if (BaseOpcode->Store || BaseOpcode->Atomic)
4749 129 : Ops.push_back(VData); // vdata
4750 737 : Ops.push_back(VAddr);
4751 1474 : Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4752 737 : if (BaseOpcode->Sampler)
4753 996 : Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4754 737 : Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4755 737 : Ops.push_back(Unorm);
4756 737 : Ops.push_back(GLC);
4757 737 : Ops.push_back(SLC);
4758 737 : Ops.push_back(IsA16 && // a16 or r128
4759 : ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
4760 737 : Ops.push_back(False); // tfe
4761 737 : Ops.push_back(False); // lwe
4762 1389 : Ops.push_back(DimInfo->DA ? True : False);
4763 737 : if (BaseOpcode->HasD16)
4764 1289 : Ops.push_back(IsD16 ? True : False);
4765 737 : if (isa<MemSDNode>(Op))
4766 706 : Ops.push_back(Op.getOperand(0)); // chain
4767 :
4768 737 : int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4769 : int Opcode = -1;
4770 :
4771 737 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4772 443 : Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
4773 : NumVDataDwords, NumVAddrDwords);
4774 443 : if (Opcode == -1)
4775 716 : Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
4776 : NumVDataDwords, NumVAddrDwords);
4777 : assert(Opcode != -1);
4778 :
4779 737 : MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4780 : if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
4781 706 : MachineMemOperand *MemRef = MemOp->getMemOperand();
4782 1412 : DAG.setNodeMemRefs(NewNode, {MemRef});
4783 : }
4784 :
4785 737 : if (BaseOpcode->AtomicX2) {
4786 : SmallVector<SDValue, 1> Elt;
4787 2 : DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
4788 4 : return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
4789 735 : } else if (IsD16 && !BaseOpcode->Store) {
4790 : MVT LoadVT = Op.getSimpleValueType();
4791 : SDValue Adjusted = adjustLoadValueTypeImpl(
4792 54 : SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
4793 54 : return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
4794 : }
4795 :
4796 708 : return SDValue(NewNode, 0);
4797 : }
4798 :
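// Simplified sketch (illustrative, not from the source) of the A16 address packing
// above when FeatureR128A16 is enabled: 16-bit coordinates go two per dword, with a
// trailing odd coordinate paired with an undefined high half. The extra undef
// padding at gradient-group boundaries in the real code is omitted here.
#include <cstdint>
#include <vector>
static std::vector<uint32_t> packA16Coords(const std::vector<uint16_t> &Coords) {
  std::vector<uint32_t> Dwords;
  for (size_t I = 0; I < Coords.size(); I += 2) {
    uint32_t Lo = Coords[I];
    uint32_t Hi = (I + 1 < Coords.size()) ? Coords[I + 1] : 0; // undef in the DAG
    Dwords.push_back(Lo | (Hi << 16));
  }
  return Dwords;
}
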
4799 528 : SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4800 : SDValue Offset, SDValue GLC,
4801 : SelectionDAG &DAG) const {
4802 528 : MachineFunction &MF = DAG.getMachineFunction();
4803 1056 : MachineMemOperand *MMO = MF.getMachineMemOperand(
4804 : MachinePointerInfo(),
4805 : MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4806 : MachineMemOperand::MOInvariant,
4807 : VT.getStoreSize(), VT.getStoreSize());
4808 :
4809 1056 : if (!Offset->isDivergent()) {
4810 : SDValue Ops[] = {
4811 : Rsrc,
4812 : Offset, // Offset
4813 : GLC // glc
4814 464 : };
4815 : return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4816 464 : DAG.getVTList(VT), Ops, VT, MMO);
4817 : }
4818 :
4819 : // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4820 : // assume that the buffer is unswizzled.
4821 : SmallVector<SDValue, 4> Loads;
4822 : unsigned NumLoads = 1;
4823 : MVT LoadVT = VT.getSimpleVT();
4824 :
4825 : assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
4826 : LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
4827 :
4828 : if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4829 : NumLoads = VT == MVT::v16i32 ? 4 : 2;
4830 : LoadVT = MVT::v4i32;
4831 : }
4832 :
4833 128 : SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4834 64 : unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4835 : SDValue Ops[] = {
4836 64 : DAG.getEntryNode(), // Chain
4837 : Rsrc, // rsrc
4838 64 : DAG.getConstant(0, DL, MVT::i32), // vindex
4839 : {}, // voffset
4840 : {}, // soffset
4841 : {}, // offset
4842 64 : DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4843 64 : DAG.getConstant(0, DL, MVT::i1), // idxen
4844 64 : };
4845 :
4846 : // Use the alignment to ensure that the required offsets will fit into the
4847 : // immediate offsets.
4848 64 : setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4849 :
4850 64 : uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4851 168 : for (unsigned i = 0; i < NumLoads; ++i) {
4852 104 : Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4853 104 : Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4854 208 : Ops, LoadVT, MMO));
4855 : }
4856 :
4857 : if (VT == MVT::v8i32 || VT == MVT::v16i32)
4858 16 : return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4859 :
4860 48 : return Loads[0];
4861 : }
4862 :
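// Sketch (illustrative only) of the divergent-offset path above: the wide result is
// assembled from 4-dword buffer loads whose immediate offsets step by 16 bytes. The
// helper is a hypothetical model of the offsets fed into Ops[5] in the loop.
#include <cstdint>
#include <vector>
static std::vector<uint64_t> splitLoadOffsets(uint64_t InstOffset,
                                              unsigned NumLoads) {
  std::vector<uint64_t> Offsets;
  for (unsigned I = 0; I < NumLoads; ++I)
    Offsets.push_back(InstOffset + 16 * I); // one BUFFER_LOAD per 16 bytes
  return Offsets;
}
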
4863 21890 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4864 : SelectionDAG &DAG) const {
4865 21890 : MachineFunction &MF = DAG.getMachineFunction();
4866 21890 : auto MFI = MF.getInfo<SIMachineFunctionInfo>();
4867 :
4868 21890 : EVT VT = Op.getValueType();
4869 : SDLoc DL(Op);
4870 21890 : unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4871 :
4872 : // TODO: Should this propagate fast-math-flags?
4873 :
4874 21890 : switch (IntrinsicID) {
4875 4 : case Intrinsic::amdgcn_implicit_buffer_ptr: {
4876 4 : if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
4877 2 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4878 : return getPreloadedValue(DAG, *MFI, VT,
4879 2 : AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4880 : }
4881 54 : case Intrinsic::amdgcn_dispatch_ptr:
4882 : case Intrinsic::amdgcn_queue_ptr: {
4883 54 : if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
4884 : DiagnosticInfoUnsupported BadIntrin(
4885 : MF.getFunction(), "unsupported hsa intrinsic without hsa target",
4886 2 : DL.getDebugLoc());
4887 2 : DAG.getContext()->diagnose(BadIntrin);
4888 2 : return DAG.getUNDEF(VT);
4889 : }
4890 :
4891 52 : auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
4892 : AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
4893 52 : return getPreloadedValue(DAG, *MFI, VT, RegID);
4894 : }
4895 40 : case Intrinsic::amdgcn_implicitarg_ptr: {
4896 40 : if (MFI->isEntryFunction())
4897 32 : return getImplicitArgPtr(DAG, DL);
4898 : return getPreloadedValue(DAG, *MFI, VT,
4899 8 : AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4900 : }
4901 13565 : case Intrinsic::amdgcn_kernarg_segment_ptr: {
4902 : return getPreloadedValue(DAG, *MFI, VT,
4903 13565 : AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4904 : }
4905 9 : case Intrinsic::amdgcn_dispatch_id: {
4906 9 : return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
4907 : }
4908 : case Intrinsic::amdgcn_rcp:
4909 29 : return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
4910 : case Intrinsic::amdgcn_rsq:
4911 33 : return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4912 5 : case Intrinsic::amdgcn_rsq_legacy:
4913 5 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4914 1 : return emitRemovedIntrinsicError(DAG, DL, VT);
4915 :
4916 4 : return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
4917 11 : case Intrinsic::amdgcn_rcp_legacy:
4918 11 : if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4919 4 : return emitRemovedIntrinsicError(DAG, DL, VT);
4920 7 : return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
4921 6 : case Intrinsic::amdgcn_rsq_clamp: {
4922 6 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4923 3 : return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
4924 :
4925 3 : Type *Type = VT.getTypeForEVT(*DAG.getContext());
4926 3 : APFloat Max = APFloat::getLargest(Type->getFltSemantics());
4927 3 : APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
4928 :
4929 3 : SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4930 : SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
4931 3 : DAG.getConstantFP(Max, DL, VT));
4932 : return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
4933 3 : DAG.getConstantFP(Min, DL, VT));
4934 : }
4935 2 : case Intrinsic::r600_read_ngroups_x:
4936 4 : if (Subtarget->isAmdHsaOS())
4937 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4938 :
4939 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4940 2 : SI::KernelInputOffsets::NGROUPS_X, 4, false);
4941 2 : case Intrinsic::r600_read_ngroups_y:
4942 4 : if (Subtarget->isAmdHsaOS())
4943 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4944 :
4945 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4946 2 : SI::KernelInputOffsets::NGROUPS_Y, 4, false);
4947 2 : case Intrinsic::r600_read_ngroups_z:
4948 4 : if (Subtarget->isAmdHsaOS())
4949 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4950 :
4951 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4952 2 : SI::KernelInputOffsets::NGROUPS_Z, 4, false);
4953 2 : case Intrinsic::r600_read_global_size_x:
4954 4 : if (Subtarget->isAmdHsaOS())
4955 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4956 :
4957 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4958 2 : SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
4959 2 : case Intrinsic::r600_read_global_size_y:
4960 4 : if (Subtarget->isAmdHsaOS())
4961 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4962 :
4963 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4964 2 : SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
4965 2 : case Intrinsic::r600_read_global_size_z:
4966 4 : if (Subtarget->isAmdHsaOS())
4967 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4968 :
4969 : return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4970 2 : SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
4971 13 : case Intrinsic::r600_read_local_size_x:
4972 26 : if (Subtarget->isAmdHsaOS())
4973 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4974 :
4975 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4976 13 : SI::KernelInputOffsets::LOCAL_SIZE_X);
4977 39 : case Intrinsic::r600_read_local_size_y:
4978 78 : if (Subtarget->isAmdHsaOS())
4979 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4980 :
4981 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4982 39 : SI::KernelInputOffsets::LOCAL_SIZE_Y);
4983 39 : case Intrinsic::r600_read_local_size_z:
4984 78 : if (Subtarget->isAmdHsaOS())
4985 0 : return emitNonHSAIntrinsicError(DAG, DL, VT);
4986 :
4987 : return lowerImplicitZextParam(DAG, Op, MVT::i16,
4988 39 : SI::KernelInputOffsets::LOCAL_SIZE_Z);
4989 49 : case Intrinsic::amdgcn_workgroup_id_x:
4990 : case Intrinsic::r600_read_tgid_x:
4991 : return getPreloadedValue(DAG, *MFI, VT,
4992 49 : AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4993 24 : case Intrinsic::amdgcn_workgroup_id_y:
4994 : case Intrinsic::r600_read_tgid_y:
4995 : return getPreloadedValue(DAG, *MFI, VT,
4996 24 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4997 24 : case Intrinsic::amdgcn_workgroup_id_z:
4998 : case Intrinsic::r600_read_tgid_z:
4999 : return getPreloadedValue(DAG, *MFI, VT,
5000 24 : AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5001 3280 : case Intrinsic::amdgcn_workitem_id_x: {
5002 : case Intrinsic::r600_read_tidig_x:
5003 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5004 3280 : SDLoc(DAG.getEntryNode()),
5005 3280 : MFI->getArgInfo().WorkItemIDX);
5006 : }
5007 125 : case Intrinsic::amdgcn_workitem_id_y:
5008 : case Intrinsic::r600_read_tidig_y:
5009 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5010 125 : SDLoc(DAG.getEntryNode()),
5011 125 : MFI->getArgInfo().WorkItemIDY);
5012 74 : case Intrinsic::amdgcn_workitem_id_z:
5013 : case Intrinsic::r600_read_tidig_z:
5014 : return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5015 74 : SDLoc(DAG.getEntryNode()),
5016 74 : MFI->getArgInfo().WorkItemIDZ);
5017 : case AMDGPUIntrinsic::SI_load_const: {
5018 : SDValue Load =
5019 : lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5020 468 : DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5021 468 : return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
5022 : }
5023 60 : case Intrinsic::amdgcn_s_buffer_load: {
5024 60 : unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5025 : return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5026 120 : DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5027 : }
5028 33 : case Intrinsic::amdgcn_fdiv_fast:
5029 33 : return lowerFDIV_FAST(Op, DAG);
5030 84 : case Intrinsic::amdgcn_interp_mov: {
5031 84 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5032 84 : SDValue Glue = M0.getValue(1);
5033 : return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5034 84 : Op.getOperand(2), Op.getOperand(3), Glue);
5035 : }
5036 215 : case Intrinsic::amdgcn_interp_p1: {
5037 215 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5038 215 : SDValue Glue = M0.getValue(1);
5039 : return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5040 215 : Op.getOperand(2), Op.getOperand(3), Glue);
5041 : }
5042 199 : case Intrinsic::amdgcn_interp_p2: {
5043 199 : SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5044 199 : SDValue Glue = SDValue(M0.getNode(), 1);
5045 : return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5046 : Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5047 199 : Glue);
5048 : }
5049 : case Intrinsic::amdgcn_sin:
5050 7 : return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5051 :
5052 : case Intrinsic::amdgcn_cos:
5053 3 : return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5054 :
5055 3 : case Intrinsic::amdgcn_log_clamp: {
5056 3 : if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5057 2 : return SDValue();
5058 :
5059 : DiagnosticInfoUnsupported BadIntrin(
5060 : MF.getFunction(), "intrinsic not supported on subtarget",
5061 1 : DL.getDebugLoc());
5062 1 : DAG.getContext()->diagnose(BadIntrin);
5063 1 : return DAG.getUNDEF(VT);
5064 : }
5065 : case Intrinsic::amdgcn_ldexp:
5066 : return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5067 10 : Op.getOperand(1), Op.getOperand(2));
5068 :
5069 : case Intrinsic::amdgcn_fract:
5070 8 : return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5071 :
5072 : case Intrinsic::amdgcn_class:
5073 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5074 64 : Op.getOperand(1), Op.getOperand(2));
5075 10 : case Intrinsic::amdgcn_div_fmas:
5076 : return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5077 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5078 10 : Op.getOperand(4));
5079 :
5080 : case Intrinsic::amdgcn_div_fixup:
5081 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5082 13 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5083 :
5084 : case Intrinsic::amdgcn_trig_preop:
5085 : return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5086 4 : Op.getOperand(1), Op.getOperand(2));
5087 27 : case Intrinsic::amdgcn_div_scale: {
5088 : // 3rd parameter required to be a constant.
5089 : const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
5090 : if (!Param)
5091 3 : return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
5092 :
5093 : // Translate to the operands expected by the machine instruction. The first
5094 : // source operand must match the input selected by the constant third operand.
5095 24 : SDValue Numerator = Op.getOperand(1);
5096 24 : SDValue Denominator = Op.getOperand(2);
5097 :
5098 : // Note this order is opposite of the machine instruction's operations,
5099 : // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5100 : // intrinsic has the numerator as the first operand to match a normal
5101 : // division operation.
5102 :
5103 48 : SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5104 :
5105 : return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5106 48 : Denominator, Numerator);
5107 : }
5108 72 : case Intrinsic::amdgcn_icmp: {
5109 72 : return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5110 : }
5111 86 : case Intrinsic::amdgcn_fcmp: {
5112 86 : return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5113 : }
5114 : case Intrinsic::amdgcn_fmed3:
5115 : return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5116 84 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5117 2 : case Intrinsic::amdgcn_fdot2:
5118 : return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5119 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5120 2 : Op.getOperand(4));
5121 : case Intrinsic::amdgcn_fmul_legacy:
5122 : return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5123 57 : Op.getOperand(1), Op.getOperand(2));
5124 : case Intrinsic::amdgcn_sffbh:
5125 4 : return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5126 : case Intrinsic::amdgcn_sbfe:
5127 : return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5128 102 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5129 : case Intrinsic::amdgcn_ubfe:
5130 : return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5131 94 : Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5132 103 : case Intrinsic::amdgcn_cvt_pkrtz:
5133 : case Intrinsic::amdgcn_cvt_pknorm_i16:
5134 : case Intrinsic::amdgcn_cvt_pknorm_u16:
5135 : case Intrinsic::amdgcn_cvt_pk_i16:
5136 : case Intrinsic::amdgcn_cvt_pk_u16: {
5137 : // FIXME: Stop adding cast if v2f16/v2i16 are legal.
5138 103 : EVT VT = Op.getValueType();
5139 : unsigned Opcode;
5140 :
5141 103 : if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5142 : Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5143 56 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5144 : Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5145 38 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5146 : Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5147 20 : else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5148 : Opcode = AMDGPUISD::CVT_PK_I16_I32;
5149 : else
5150 : Opcode = AMDGPUISD::CVT_PK_U16_U32;
5151 :
5152 : if (isTypeLegal(VT))
5153 103 : return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5154 :
5155 : SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5156 0 : Op.getOperand(1), Op.getOperand(2));
5157 0 : return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5158 : }
5159 13 : case Intrinsic::amdgcn_wqm: {
5160 13 : SDValue Src = Op.getOperand(1);
5161 13 : return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5162 13 : 0);
5163 : }
5164 278 : case Intrinsic::amdgcn_wwm: {
5165 278 : SDValue Src = Op.getOperand(1);
5166 278 : return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5167 278 : 0);
5168 : }
5169 : case Intrinsic::amdgcn_fmad_ftz:
5170 : return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5171 111 : Op.getOperand(2), Op.getOperand(3));
5172 2241 : default:
5173 2241 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5174 2241 : AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5175 2241 : return lowerImage(Op, ImageDimIntr, DAG);
5176 :
5177 2205 : return Op;
5178 : }
5179 : }
5180 :
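// One case from the switch above that is easy to model numerically is
// amdgcn_rsq_clamp on subtargets without the native instruction: rsq(x) clamped
// between the largest finite float and its negation. A minimal illustrative
// sketch, ignoring the NaN semantics of FMINNUM/FMAXNUM and approximating
// AMDGPUISD::RSQ as 1/sqrt:
#include <algorithm>
#include <cmath>
#include <limits>
static float rsqClampF32(float X) {
  float Max = std::numeric_limits<float>::max();
  float Rsq = 1.0f / std::sqrt(X);           // stand-in for AMDGPUISD::RSQ
  return std::max(std::min(Rsq, Max), -Max); // FMINNUM with +Max, FMAXNUM with -Max
}
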
5181 1617 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5182 : SelectionDAG &DAG) const {
5183 3234 : unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5184 : SDLoc DL(Op);
5185 :
5186 1617 : switch (IntrID) {
5187 : case Intrinsic::amdgcn_atomic_inc:
5188 : case Intrinsic::amdgcn_atomic_dec:
5189 : case Intrinsic::amdgcn_ds_fadd:
5190 : case Intrinsic::amdgcn_ds_fmin:
5191 : case Intrinsic::amdgcn_ds_fmax: {
5192 : MemSDNode *M = cast<MemSDNode>(Op);
5193 : unsigned Opc;
5194 : switch (IntrID) {
5195 : case Intrinsic::amdgcn_atomic_inc:
5196 : Opc = AMDGPUISD::ATOMIC_INC;
5197 : break;
5198 115 : case Intrinsic::amdgcn_atomic_dec:
5199 : Opc = AMDGPUISD::ATOMIC_DEC;
5200 115 : break;
5201 6 : case Intrinsic::amdgcn_ds_fadd:
5202 : Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
5203 6 : break;
5204 6 : case Intrinsic::amdgcn_ds_fmin:
5205 : Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5206 6 : break;
5207 6 : case Intrinsic::amdgcn_ds_fmax:
5208 : Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5209 6 : break;
5210 0 : default:
5211 0 : llvm_unreachable("Unknown intrinsic!");
5212 : }
5213 : SDValue Ops[] = {
5214 245 : M->getOperand(0), // Chain
5215 : M->getOperand(2), // Ptr
5216 : M->getOperand(3) // Value
5217 245 : };
5218 :
5219 245 : return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5220 735 : M->getMemoryVT(), M->getMemOperand());
5221 : }
5222 202 : case Intrinsic::amdgcn_buffer_load:
5223 : case Intrinsic::amdgcn_buffer_load_format: {
5224 404 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5225 404 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5226 : unsigned IdxEn = 1;
5227 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5228 226 : IdxEn = Idx->getZExtValue() != 0;
5229 : SDValue Ops[] = {
5230 : Op.getOperand(0), // Chain
5231 : Op.getOperand(2), // rsrc
5232 : Op.getOperand(3), // vindex
5233 : SDValue(), // voffset -- will be set by setBufferOffsets
5234 : SDValue(), // soffset -- will be set by setBufferOffsets
5235 : SDValue(), // offset -- will be set by setBufferOffsets
5236 202 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5237 202 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5238 202 : };
5239 :
5240 202 : setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5241 202 : unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5242 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5243 :
5244 202 : EVT VT = Op.getValueType();
5245 202 : EVT IntVT = VT.changeTypeToInteger();
5246 : auto *M = cast<MemSDNode>(Op);
5247 202 : EVT LoadVT = Op.getValueType();
5248 :
5249 211 : if (LoadVT.getScalarType() == MVT::f16)
5250 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5251 9 : M, DAG, Ops);
5252 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5253 579 : M->getMemOperand());
5254 : }
5255 107 : case Intrinsic::amdgcn_raw_buffer_load:
5256 : case Intrinsic::amdgcn_raw_buffer_load_format: {
5257 107 : auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5258 : SDValue Ops[] = {
5259 : Op.getOperand(0), // Chain
5260 : Op.getOperand(2), // rsrc
5261 107 : DAG.getConstant(0, DL, MVT::i32), // vindex
5262 : Offsets.first, // voffset
5263 : Op.getOperand(4), // soffset
5264 : Offsets.second, // offset
5265 : Op.getOperand(5), // cachepolicy
5266 107 : DAG.getConstant(0, DL, MVT::i1), // idxen
5267 107 : };
5268 :
5269 107 : unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5270 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5271 :
5272 107 : EVT VT = Op.getValueType();
5273 107 : EVT IntVT = VT.changeTypeToInteger();
5274 : auto *M = cast<MemSDNode>(Op);
5275 107 : EVT LoadVT = Op.getValueType();
5276 :
5277 116 : if (LoadVT.getScalarType() == MVT::f16)
5278 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5279 9 : M, DAG, Ops);
5280 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5281 294 : M->getMemOperand());
5282 : }
5283 75 : case Intrinsic::amdgcn_struct_buffer_load:
5284 : case Intrinsic::amdgcn_struct_buffer_load_format: {
5285 75 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5286 : SDValue Ops[] = {
5287 : Op.getOperand(0), // Chain
5288 : Op.getOperand(2), // rsrc
5289 : Op.getOperand(3), // vindex
5290 : Offsets.first, // voffset
5291 : Op.getOperand(5), // soffset
5292 : Offsets.second, // offset
5293 : Op.getOperand(6), // cachepolicy
5294 75 : DAG.getConstant(1, DL, MVT::i1), // idxen
5295 75 : };
5296 :
5297 75 : unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5298 : AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5299 :
5300 75 : EVT VT = Op.getValueType();
5301 75 : EVT IntVT = VT.changeTypeToInteger();
5302 : auto *M = cast<MemSDNode>(Op);
5303 75 : EVT LoadVT = Op.getValueType();
5304 :
5305 84 : if (LoadVT.getScalarType() == MVT::f16)
5306 : return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5307 9 : M, DAG, Ops);
5308 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5309 198 : M->getMemOperand());
5310 : }
5311 : case Intrinsic::amdgcn_tbuffer_load: {
5312 : MemSDNode *M = cast<MemSDNode>(Op);
5313 37 : EVT LoadVT = Op.getValueType();
5314 :
5315 74 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5316 74 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5317 74 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5318 74 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5319 : unsigned IdxEn = 1;
5320 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5321 66 : IdxEn = Idx->getZExtValue() != 0;
5322 : SDValue Ops[] = {
5323 : Op.getOperand(0), // Chain
5324 : Op.getOperand(2), // rsrc
5325 : Op.getOperand(3), // vindex
5326 : Op.getOperand(4), // voffset
5327 : Op.getOperand(5), // soffset
5328 : Op.getOperand(6), // offset
5329 37 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5330 37 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5331 37 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5332 37 : };
5333 :
5334 46 : if (LoadVT.getScalarType() == MVT::f16)
5335 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5336 9 : M, DAG, Ops);
5337 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5338 : Op->getVTList(), Ops, LoadVT,
5339 84 : M->getMemOperand());
5340 : }
5341 : case Intrinsic::amdgcn_raw_tbuffer_load: {
5342 : MemSDNode *M = cast<MemSDNode>(Op);
5343 33 : EVT LoadVT = Op.getValueType();
5344 33 : auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5345 :
5346 : SDValue Ops[] = {
5347 : Op.getOperand(0), // Chain
5348 : Op.getOperand(2), // rsrc
5349 33 : DAG.getConstant(0, DL, MVT::i32), // vindex
5350 : Offsets.first, // voffset
5351 : Op.getOperand(4), // soffset
5352 : Offsets.second, // offset
5353 : Op.getOperand(5), // format
5354 : Op.getOperand(6), // cachepolicy
5355 33 : DAG.getConstant(0, DL, MVT::i1), // idxen
5356 33 : };
5357 :
5358 42 : if (LoadVT.getScalarType() == MVT::f16)
5359 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5360 9 : M, DAG, Ops);
5361 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5362 : Op->getVTList(), Ops, LoadVT,
5363 72 : M->getMemOperand());
5364 : }
5365 : case Intrinsic::amdgcn_struct_tbuffer_load: {
5366 : MemSDNode *M = cast<MemSDNode>(Op);
5367 37 : EVT LoadVT = Op.getValueType();
5368 37 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5369 :
5370 : SDValue Ops[] = {
5371 : Op.getOperand(0), // Chain
5372 : Op.getOperand(2), // rsrc
5373 : Op.getOperand(3), // vindex
5374 : Offsets.first, // voffset
5375 : Op.getOperand(5), // soffset
5376 : Offsets.second, // offset
5377 : Op.getOperand(6), // format
5378 : Op.getOperand(7), // cachepolicy
5379 37 : DAG.getConstant(1, DL, MVT::i1), // idxen
5380 37 : };
5381 :
5382 46 : if (LoadVT.getScalarType() == MVT::f16)
5383 : return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5384 9 : M, DAG, Ops);
5385 : return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5386 : Op->getVTList(), Ops, LoadVT,
5387 84 : M->getMemOperand());
5388 : }
5389 58 : case Intrinsic::amdgcn_buffer_atomic_swap:
5390 : case Intrinsic::amdgcn_buffer_atomic_add:
5391 : case Intrinsic::amdgcn_buffer_atomic_sub:
5392 : case Intrinsic::amdgcn_buffer_atomic_smin:
5393 : case Intrinsic::amdgcn_buffer_atomic_umin:
5394 : case Intrinsic::amdgcn_buffer_atomic_smax:
5395 : case Intrinsic::amdgcn_buffer_atomic_umax:
5396 : case Intrinsic::amdgcn_buffer_atomic_and:
5397 : case Intrinsic::amdgcn_buffer_atomic_or:
5398 : case Intrinsic::amdgcn_buffer_atomic_xor: {
5399 116 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5400 : unsigned IdxEn = 1;
5401 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5402 60 : IdxEn = Idx->getZExtValue() != 0;
5403 : SDValue Ops[] = {
5404 : Op.getOperand(0), // Chain
5405 : Op.getOperand(2), // vdata
5406 : Op.getOperand(3), // rsrc
5407 : Op.getOperand(4), // vindex
5408 : SDValue(), // voffset -- will be set by setBufferOffsets
5409 : SDValue(), // soffset -- will be set by setBufferOffsets
5410 : SDValue(), // offset -- will be set by setBufferOffsets
5411 58 : DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5412 58 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5413 58 : };
5414 58 : setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5415 58 : EVT VT = Op.getValueType();
5416 :
5417 : auto *M = cast<MemSDNode>(Op);
5418 : unsigned Opcode = 0;
5419 :
5420 : switch (IntrID) {
5421 : case Intrinsic::amdgcn_buffer_atomic_swap:
5422 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5423 : break;
5424 16 : case Intrinsic::amdgcn_buffer_atomic_add:
5425 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5426 16 : break;
5427 14 : case Intrinsic::amdgcn_buffer_atomic_sub:
5428 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5429 14 : break;
5430 2 : case Intrinsic::amdgcn_buffer_atomic_smin:
5431 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5432 2 : break;
5433 2 : case Intrinsic::amdgcn_buffer_atomic_umin:
5434 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5435 2 : break;
5436 2 : case Intrinsic::amdgcn_buffer_atomic_smax:
5437 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5438 2 : break;
5439 2 : case Intrinsic::amdgcn_buffer_atomic_umax:
5440 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5441 2 : break;
5442 2 : case Intrinsic::amdgcn_buffer_atomic_and:
5443 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5444 2 : break;
5445 2 : case Intrinsic::amdgcn_buffer_atomic_or:
5446 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5447 2 : break;
5448 2 : case Intrinsic::amdgcn_buffer_atomic_xor:
5449 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5450 2 : break;
5451 0 : default:
5452 0 : llvm_unreachable("unhandled atomic opcode");
5453 : }
5454 :
5455 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5456 174 : M->getMemOperand());
5457 : }
5458 54 : case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5459 : case Intrinsic::amdgcn_raw_buffer_atomic_add:
5460 : case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5461 : case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5462 : case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5463 : case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5464 : case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5465 : case Intrinsic::amdgcn_raw_buffer_atomic_and:
5466 : case Intrinsic::amdgcn_raw_buffer_atomic_or:
5467 : case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5468 54 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5469 : SDValue Ops[] = {
5470 : Op.getOperand(0), // Chain
5471 : Op.getOperand(2), // vdata
5472 : Op.getOperand(3), // rsrc
5473 54 : DAG.getConstant(0, DL, MVT::i32), // vindex
5474 : Offsets.first, // voffset
5475 : Op.getOperand(5), // soffset
5476 : Offsets.second, // offset
5477 : Op.getOperand(6), // cachepolicy
5478 54 : DAG.getConstant(0, DL, MVT::i1), // idxen
5479 54 : };
5480 54 : EVT VT = Op.getValueType();
5481 :
5482 : auto *M = cast<MemSDNode>(Op);
5483 : unsigned Opcode = 0;
5484 :
5485 : switch (IntrID) {
5486 : case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5487 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5488 : break;
5489 16 : case Intrinsic::amdgcn_raw_buffer_atomic_add:
5490 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5491 16 : break;
5492 14 : case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5493 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5494 14 : break;
5495 2 : case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5496 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5497 2 : break;
5498 2 : case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5499 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5500 2 : break;
5501 2 : case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5502 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5503 2 : break;
5504 2 : case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5505 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5506 2 : break;
5507 2 : case Intrinsic::amdgcn_raw_buffer_atomic_and:
5508 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5509 2 : break;
5510 2 : case Intrinsic::amdgcn_raw_buffer_atomic_or:
5511 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5512 2 : break;
5513 2 : case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5514 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5515 2 : break;
5516 0 : default:
5517 0 : llvm_unreachable("unhandled atomic opcode");
5518 : }
5519 :
5520 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5521 162 : M->getMemOperand());
5522 : }
5523 64 : case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5524 : case Intrinsic::amdgcn_struct_buffer_atomic_add:
5525 : case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5526 : case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5527 : case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5528 : case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5529 : case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5530 : case Intrinsic::amdgcn_struct_buffer_atomic_and:
5531 : case Intrinsic::amdgcn_struct_buffer_atomic_or:
5532 : case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5533 64 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5534 : SDValue Ops[] = {
5535 : Op.getOperand(0), // Chain
5536 : Op.getOperand(2), // vdata
5537 : Op.getOperand(3), // rsrc
5538 : Op.getOperand(4), // vindex
5539 : Offsets.first, // voffset
5540 : Op.getOperand(6), // soffset
5541 : Offsets.second, // offset
5542 : Op.getOperand(7), // cachepolicy
5543 64 : DAG.getConstant(1, DL, MVT::i1), // idxen
5544 64 : };
5545 64 : EVT VT = Op.getValueType();
5546 :
5547 : auto *M = cast<MemSDNode>(Op);
5548 : unsigned Opcode = 0;
5549 :
5550 : switch (IntrID) {
5551 : case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5552 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5553 : break;
5554 19 : case Intrinsic::amdgcn_struct_buffer_atomic_add:
5555 : Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5556 19 : break;
5557 17 : case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5558 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5559 17 : break;
5560 2 : case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5561 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5562 2 : break;
5563 2 : case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5564 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5565 2 : break;
5566 2 : case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5567 : Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5568 2 : break;
5569 2 : case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5570 : Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5571 2 : break;
5572 2 : case Intrinsic::amdgcn_struct_buffer_atomic_and:
5573 : Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5574 2 : break;
5575 2 : case Intrinsic::amdgcn_struct_buffer_atomic_or:
5576 : Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5577 2 : break;
5578 2 : case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5579 : Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5580 2 : break;
5581 0 : default:
5582 0 : llvm_unreachable("unhandled atomic opcode");
5583 : }
5584 :
5585 : return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5586 192 : M->getMemOperand());
5587 : }
5588 12 : case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
5589 24 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5590 : unsigned IdxEn = 1;
5591 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5592 16 : IdxEn = Idx->getZExtValue() != 0;
5593 : SDValue Ops[] = {
5594 : Op.getOperand(0), // Chain
5595 : Op.getOperand(2), // src
5596 : Op.getOperand(3), // cmp
5597 : Op.getOperand(4), // rsrc
5598 : Op.getOperand(5), // vindex
5599 : SDValue(), // voffset -- will be set by setBufferOffsets
5600 : SDValue(), // soffset -- will be set by setBufferOffsets
5601 : SDValue(), // offset -- will be set by setBufferOffsets
5602 12 : DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5603 12 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5604 12 : };
5605 12 : setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5606 12 : EVT VT = Op.getValueType();
5607 : auto *M = cast<MemSDNode>(Op);
5608 :
5609 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5610 36 : Op->getVTList(), Ops, VT, M->getMemOperand());
5611 : }
5612 8 : case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5613 8 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5614 : SDValue Ops[] = {
5615 : Op.getOperand(0), // Chain
5616 : Op.getOperand(2), // src
5617 : Op.getOperand(3), // cmp
5618 : Op.getOperand(4), // rsrc
5619 8 : DAG.getConstant(0, DL, MVT::i32), // vindex
5620 : Offsets.first, // voffset
5621 : Op.getOperand(6), // soffset
5622 : Offsets.second, // offset
5623 : Op.getOperand(7), // cachepolicy
5624 8 : DAG.getConstant(0, DL, MVT::i1), // idxen
5625 8 : };
5626 8 : EVT VT = Op.getValueType();
5627 : auto *M = cast<MemSDNode>(Op);
5628 :
5629 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5630 24 : Op->getVTList(), Ops, VT, M->getMemOperand());
5631 : }
5632 12 : case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5633 12 : auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5634 : SDValue Ops[] = {
5635 : Op.getOperand(0), // Chain
5636 : Op.getOperand(2), // src
5637 : Op.getOperand(3), // cmp
5638 : Op.getOperand(4), // rsrc
5639 : Op.getOperand(5), // vindex
5640 : Offsets.first, // voffset
5641 : Op.getOperand(7), // soffset
5642 : Offsets.second, // offset
5643 : Op.getOperand(8), // cachepolicy
5644 12 : DAG.getConstant(1, DL, MVT::i1), // idxen
5645 12 : };
5646 12 : EVT VT = Op.getValueType();
5647 : auto *M = cast<MemSDNode>(Op);
5648 :
5649 : return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5650 36 : Op->getVTList(), Ops, VT, M->getMemOperand());
5651 : }
5652 :
5653 673 : default:
5654 673 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5655 673 : AMDGPU::getImageDimIntrinsicInfo(IntrID))
5656 673 : return lowerImage(Op, ImageDimIntr, DAG);
5657 :
5658 52 : return SDValue();
5659 : }
5660 : }
5661 :
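 : // A brief sketch of what handleD16VData does: on subtargets with unpacked
 : // D16 VMEM, each 16-bit element of the store data occupies its own 32-bit
 : // dword, so the vector is bitcast to integers and every element is
 : // zero-extended to i32 (e.g. <2 x half> becomes two i32 dwords); on packed
 : // subtargets the data is passed through unchanged.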
5662 66 : SDValue SITargetLowering::handleD16VData(SDValue VData,
5663 : SelectionDAG &DAG) const {
5664 66 : EVT StoreVT = VData.getValueType();
5665 :
5666 : // No change for f16 and legal vector D16 types.
5667 66 : if (!StoreVT.isVector())
5668 21 : return VData;
5669 :
5670 : SDLoc DL(VData);
5671 : assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5672 :
5673 45 : if (Subtarget->hasUnpackedD16VMem()) {
5674 : // We need to unpack the packed data to store.
5675 15 : EVT IntStoreVT = StoreVT.changeTypeToInteger();
5676 15 : SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5677 :
5678 15 : EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5679 15 : StoreVT.getVectorNumElements());
5680 15 : SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5681 15 : return DAG.UnrollVectorOp(ZExt.getNode());
5682 : }
5683 :
5684 : assert(isTypeLegal(StoreVT));
5685 30 : return VData;
5686 : }
5687 :
5688 2731 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5689 : SelectionDAG &DAG) const {
5690 : SDLoc DL(Op);
5691 2731 : SDValue Chain = Op.getOperand(0);
5692 2731 : unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5693 2731 : MachineFunction &MF = DAG.getMachineFunction();
5694 :
5695 2731 : switch (IntrinsicID) {
5696 403 : case Intrinsic::amdgcn_exp: {
5697 : const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5698 : const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5699 : const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
5700 : const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
5701 :
5702 : const SDValue Ops[] = {
5703 : Chain,
5704 806 : DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5705 806 : DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5706 : Op.getOperand(4), // src0
5707 : Op.getOperand(5), // src1
5708 : Op.getOperand(6), // src2
5709 : Op.getOperand(7), // src3
5710 403 : DAG.getTargetConstant(0, DL, MVT::i1), // compr
5711 403 : DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5712 806 : };
5713 :
5714 806 : unsigned Opc = Done->isNullValue() ?
5715 : AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5716 806 : return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5717 : }
5718 96 : case Intrinsic::amdgcn_exp_compr: {
5719 : const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5720 : const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5721 96 : SDValue Src0 = Op.getOperand(4);
5722 96 : SDValue Src1 = Op.getOperand(5);
5723 : const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
5724 : const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
5725 :
5726 96 : SDValue Undef = DAG.getUNDEF(MVT::f32);
5727 : const SDValue Ops[] = {
5728 : Chain,
5729 192 : DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5730 192 : DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5731 96 : DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
5732 96 : DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
5733 : Undef, // src2
5734 : Undef, // src3
5735 96 : DAG.getTargetConstant(1, DL, MVT::i1), // compr
5736 96 : DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5737 192 : };
5738 :
5739 192 : unsigned Opc = Done->isNullValue() ?
5740 : AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5741 192 : return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5742 : }
5743 28 : case Intrinsic::amdgcn_s_sendmsg:
5744 : case Intrinsic::amdgcn_s_sendmsghalt: {
5745 28 : unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
5746 : AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
5747 28 : Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
5748 28 : SDValue Glue = Chain.getValue(1);
5749 : return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
5750 28 : Op.getOperand(2), Glue);
5751 : }
5752 : case Intrinsic::amdgcn_init_exec: {
5753 : return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
5754 3 : Op.getOperand(2));
5755 : }
5756 : case Intrinsic::amdgcn_init_exec_from_input: {
5757 : return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
5758 4 : Op.getOperand(2), Op.getOperand(3));
5759 : }
5760 31 : case AMDGPUIntrinsic::AMDGPU_kill: {
5761 31 : SDValue Src = Op.getOperand(2);
5762 : if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
5763 22 : if (!K->isNegative())
5764 4 : return Chain;
5765 :
5766 7 : SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
5767 7 : return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
5768 : }
5769 :
5770 20 : SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
5771 20 : return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
5772 : }
5773 145 : case Intrinsic::amdgcn_s_barrier: {
5774 145 : if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
5775 135 : const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5776 135 : unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
5777 135 : if (WGSize <= ST.getWavefrontSize())
5778 5 : return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
5779 5 : Op.getOperand(0)), 0);
5780 : }
5781 140 : return SDValue();
5782 : };
5783 14 : case AMDGPUIntrinsic::SI_tbuffer_store: {
5784 :
5785 : // Extract vindex and voffset from vaddr as appropriate
5786 : const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
5787 : const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
5788 14 : SDValue VAddr = Op.getOperand(5);
5789 :
5790 14 : SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
5791 :
5792 : assert(!(OffEn->isOne() && IdxEn->isOne()) &&
5793 : "Legacy intrinsic doesn't support both offset and index - use new version");
5794 :
5795 28 : SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
5796 32 : SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
5797 :
5798 : // Deal with the vec-3 case
5799 : const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
5800 28 : auto Opcode = NumChannels->getZExtValue() == 3 ?
5801 : AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5802 :
5803 28 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5804 28 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5805 28 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
5806 14 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
5807 : SDValue Ops[] = {
5808 : Chain,
5809 : Op.getOperand(3), // vdata
5810 : Op.getOperand(2), // rsrc
5811 : VIndex,
5812 : VOffset,
5813 : Op.getOperand(6), // soffset
5814 : Op.getOperand(7), // inst_offset
5815 14 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5816 14 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5817 28 : DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
5818 14 : };
5819 :
5820 : assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
5821 : "Value of tfe other than zero is unsupported");
5822 :
5823 28 : EVT VT = Op.getOperand(3).getValueType();
5824 28 : MachineMemOperand *MMO = MF.getMachineMemOperand(
5825 : MachinePointerInfo(),
5826 : MachineMemOperand::MOStore,
5827 : VT.getStoreSize(), 4);
5828 : return DAG.getMemIntrinsicNode(Opcode, DL,
5829 28 : Op->getVTList(), Ops, VT, MMO);
5830 : }
5831 :
5832 41 : case Intrinsic::amdgcn_tbuffer_store: {
5833 41 : SDValue VData = Op.getOperand(2);
5834 41 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5835 41 : if (IsD16)
5836 9 : VData = handleD16VData(VData, DAG);
5837 82 : unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5838 82 : unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5839 82 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5840 82 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
5841 : unsigned IdxEn = 1;
5842 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5843 36 : IdxEn = Idx->getZExtValue() != 0;
5844 : SDValue Ops[] = {
5845 : Chain,
5846 : VData, // vdata
5847 : Op.getOperand(3), // rsrc
5848 : Op.getOperand(4), // vindex
5849 : Op.getOperand(5), // voffset
5850 : Op.getOperand(6), // soffset
5851 : Op.getOperand(7), // offset
5852 41 : DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5853 41 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5854            41 :       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5855 41 : };
5856 41 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5857 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5858 : MemSDNode *M = cast<MemSDNode>(Op);
5859 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5860 123 : M->getMemoryVT(), M->getMemOperand());
5861 : }
5862 :
5863 35 : case Intrinsic::amdgcn_struct_tbuffer_store: {
5864 35 : SDValue VData = Op.getOperand(2);
5865 35 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5866 35 : if (IsD16)
5867 9 : VData = handleD16VData(VData, DAG);
5868 35 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5869 : SDValue Ops[] = {
5870 : Chain,
5871 : VData, // vdata
5872 : Op.getOperand(3), // rsrc
5873 : Op.getOperand(4), // vindex
5874 : Offsets.first, // voffset
5875 : Op.getOperand(6), // soffset
5876 : Offsets.second, // offset
5877 : Op.getOperand(7), // format
5878 : Op.getOperand(8), // cachepolicy
5879            35 :       DAG.getConstant(1, DL, MVT::i1), // idxen
5880 70 : };
5881 35 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5882 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5883 : MemSDNode *M = cast<MemSDNode>(Op);
5884 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5885 105 : M->getMemoryVT(), M->getMemOperand());
5886 : }
5887 :
5888 27 : case Intrinsic::amdgcn_raw_tbuffer_store: {
5889 27 : SDValue VData = Op.getOperand(2);
5890 27 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5891 27 : if (IsD16)
5892 9 : VData = handleD16VData(VData, DAG);
5893 27 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5894 : SDValue Ops[] = {
5895 : Chain,
5896 : VData, // vdata
5897 : Op.getOperand(3), // rsrc
5898 27 : DAG.getConstant(0, DL, MVT::i32), // vindex
5899 : Offsets.first, // voffset
5900 : Op.getOperand(5), // soffset
5901 : Offsets.second, // offset
5902 : Op.getOperand(6), // format
5903 : Op.getOperand(7), // cachepolicy
5904            27 :       DAG.getConstant(0, DL, MVT::i1), // idxen
5905 54 : };
5906 27 : unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5907 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5908 : MemSDNode *M = cast<MemSDNode>(Op);
5909 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5910 81 : M->getMemoryVT(), M->getMemOperand());
5911 : }
5912 :
5913 153 : case Intrinsic::amdgcn_buffer_store:
5914 : case Intrinsic::amdgcn_buffer_store_format: {
5915 153 : SDValue VData = Op.getOperand(2);
5916 153 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5917 153 : if (IsD16)
5918 9 : VData = handleD16VData(VData, DAG);
5919 306 : unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5920 306 : unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5921 : unsigned IdxEn = 1;
5922 : if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5923 180 : IdxEn = Idx->getZExtValue() != 0;
5924 : SDValue Ops[] = {
5925 : Chain,
5926 : VData,
5927 : Op.getOperand(3), // rsrc
5928 : Op.getOperand(4), // vindex
5929 : SDValue(), // voffset -- will be set by setBufferOffsets
5930 : SDValue(), // soffset -- will be set by setBufferOffsets
5931 : SDValue(), // offset -- will be set by setBufferOffsets
5932 153 : DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5933 153 : DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5934 153 : };
5935 153 : setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5936 153 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
5937 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5938 153 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5939 : MemSDNode *M = cast<MemSDNode>(Op);
5940 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5941 459 : M->getMemoryVT(), M->getMemOperand());
5942 : }
5943 :
5944 95 : case Intrinsic::amdgcn_raw_buffer_store:
5945 : case Intrinsic::amdgcn_raw_buffer_store_format: {
5946 95 : SDValue VData = Op.getOperand(2);
5947 95 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5948 95 : if (IsD16)
5949 9 : VData = handleD16VData(VData, DAG);
5950 95 : auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5951 : SDValue Ops[] = {
5952 : Chain,
5953 : VData,
5954 : Op.getOperand(3), // rsrc
5955 95 : DAG.getConstant(0, DL, MVT::i32), // vindex
5956 : Offsets.first, // voffset
5957 : Op.getOperand(5), // soffset
5958 : Offsets.second, // offset
5959 : Op.getOperand(6), // cachepolicy
5960 95 : DAG.getConstant(0, DL, MVT::i1), // idxen
5961 190 : };
5962 95 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
5963 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5964 95 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5965 : MemSDNode *M = cast<MemSDNode>(Op);
5966 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5967 285 : M->getMemoryVT(), M->getMemOperand());
5968 : }
5969 :
5970 63 : case Intrinsic::amdgcn_struct_buffer_store:
5971 : case Intrinsic::amdgcn_struct_buffer_store_format: {
5972 63 : SDValue VData = Op.getOperand(2);
5973 63 : bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5974 63 : if (IsD16)
5975 9 : VData = handleD16VData(VData, DAG);
5976 63 : auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5977 : SDValue Ops[] = {
5978 : Chain,
5979 : VData,
5980 : Op.getOperand(3), // rsrc
5981 : Op.getOperand(4), // vindex
5982 : Offsets.first, // voffset
5983 : Op.getOperand(6), // soffset
5984 : Offsets.second, // offset
5985 : Op.getOperand(7), // cachepolicy
5986 63 : DAG.getConstant(1, DL, MVT::i1), // idxen
5987 126 : };
5988 63 : unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
5989 : AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5990 63 : Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5991 : MemSDNode *M = cast<MemSDNode>(Op);
5992 : return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5993 189 : M->getMemoryVT(), M->getMemOperand());
5994 : }
5995 :
5996 1593 : default: {
5997 1593 : if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5998 1593 : AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5999 87 : return lowerImage(Op, ImageDimIntr, DAG);
6000 :
6001 1506 : return Op;
6002 : }
6003 : }
6004 : }
6005 :
6006 : // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6007 : // offset (the offset that is included in bounds checking and swizzling, to be
6008 : // split between the instruction's voffset and immoffset fields) and soffset
6009 : // (the offset that is excluded from bounds checking and swizzling, to go in
6010 : // the instruction's soffset field). This function takes the first kind of
6011 : // offset and figures out how to split it between voffset and immoffset.
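 : // For example (rough arithmetic; the immoffset field holds 0..4095): a
 : // combined offset of (add x, 4100) splits into voffset = (add x, 4096) and
 : // immoffset = 4, while a plain constant 100 becomes voffset = 0 and
 : // immoffset = 100.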
6012 610 : std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6013 : SDValue Offset, SelectionDAG &DAG) const {
6014 : SDLoc DL(Offset);
6015 : const unsigned MaxImm = 4095;
6016 610 : SDValue N0 = Offset;
6017 : ConstantSDNode *C1 = nullptr;
6018 610 : if (N0.getOpcode() == ISD::ADD) {
6019 : if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
6020 80 : N0 = N0.getOperand(0);
6021 : } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6022 407 : N0 = SDValue();
6023 :
6024 610 : if (C1) {
6025 487 : unsigned ImmOffset = C1->getZExtValue();
6026             :     // If the immediate value is too big for the immoffset field, keep only the
6027             :     // low 12 bits (value & 4095) in the immoffset field; the remaining multiple
6028             :     // of 4096 is copied/added into the voffset field, where it stands more
6029             :     // chance of being CSEd with the copy/add for another similar load/store.
6030 : // However, do not do that rounding down to a multiple of 4096 if that is a
6031 : // negative number, as it appears to be illegal to have a negative offset
6032 : // in the vgpr, even if adding the immediate offset makes it positive.
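 :     // E.g. for 0xFFFFF00F the rounded-down part 0xFFFFF000 is negative as an
 :     // i32, so the whole value goes into voffset and immoffset stays 0.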
6033 487 : unsigned Overflow = ImmOffset & ~MaxImm;
6034 487 : ImmOffset -= Overflow;
6035 487 : if ((int32_t)Overflow < 0) {
6036 : Overflow += ImmOffset;
6037 : ImmOffset = 0;
6038 : }
6039 487 : C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6040 487 : if (Overflow) {
6041 4 : auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6042 4 : if (!N0)
6043 0 : N0 = OverflowVal;
6044 : else {
6045 4 : SDValue Ops[] = { N0, OverflowVal };
6046 4 : N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6047 : }
6048 : }
6049 : }
6050 610 : if (!N0)
6051 407 : N0 = DAG.getConstant(0, DL, MVT::i32);
6052 610 : if (!C1)
6053 123 : C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6054 610 : return {N0, SDValue(C1, 0)};
6055 : }
6056 :
6057 : // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6058 : // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6059 : // pointed to by Offsets.
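 : // Roughly: a purely constant combined offset that the subtarget can encode
 : // is split between soffset and instoffset with voffset = 0; a base plus an
 : // encodable constant keeps the base in voffset and splits the constant;
 : // anything else goes entirely into voffset with soffset = instoffset = 0.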
6060 489 : void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6061 : SelectionDAG &DAG, SDValue *Offsets,
6062 : unsigned Align) const {
6063 : SDLoc DL(CombinedOffset);
6064 : if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6065 296 : uint32_t Imm = C->getZExtValue();
6066 : uint32_t SOffset, ImmOffset;
6067 296 : if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6068 288 : Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6069 288 : Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6070 288 : Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6071 288 : return;
6072 : }
6073 : }
6074 201 : if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6075 106 : SDValue N0 = CombinedOffset.getOperand(0);
6076 106 : SDValue N1 = CombinedOffset.getOperand(1);
6077 : uint32_t SOffset, ImmOffset;
6078 106 : int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6079 210 : if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6080 104 : Subtarget, Align)) {
6081 98 : Offsets[0] = N0;
6082 98 : Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6083 98 : Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6084 98 : return;
6085 : }
6086 : }
6087 103 : Offsets[0] = CombinedOffset;
6088 103 : Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6089 103 : Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6090 : }
6091 :
6092 48 : static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6093 : ISD::LoadExtType ExtType, SDValue Op,
6094 : const SDLoc &SL, EVT VT) {
6095 48 : if (VT.bitsLT(Op.getValueType()))
6096 16 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6097 :
6098 32 : switch (ExtType) {
6099 : case ISD::SEXTLOAD:
6100 2 : return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6101 : case ISD::ZEXTLOAD:
6102 20 : return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6103 : case ISD::EXTLOAD:
6104 10 : return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6105 0 : case ISD::NON_EXTLOAD:
6106 0 : return Op;
6107 : }
6108 :
6109 0 : llvm_unreachable("invalid ext type");
6110 : }
6111 :
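 : // A rough summary of widenLoad: uniform constant-address loads narrower than
 : // 32 bits (once DAG legalization is done) are widened to an i32 load, the
 : // loaded value is masked/extended back to the original width, and the result
 : // is bitcast to the original type, presumably so the access can still be
 : // selected as a scalar load.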
6112 310576 : SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6113 310576 : SelectionDAG &DAG = DCI.DAG;
6114 310576 : if (Ld->getAlignment() < 4 || Ld->isDivergent())
6115 73345 : return SDValue();
6116 :
6117 : // FIXME: Constant loads should all be marked invariant.
6118 : unsigned AS = Ld->getAddressSpace();
6119 474462 : if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6120 237231 : AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6121 35839 : (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6122 85531 : return SDValue();
6123 :
6124 : // Don't do this early, since it may interfere with adjacent load merging for
6125 : // illegal types. We can avoid losing alignment information for exotic types
6126 : // pre-legalize.
6127 151700 : EVT MemVT = Ld->getMemoryVT();
6128 151700 : if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6129 41428 : MemVT.getSizeInBits() >= 32)
6130 151652 : return SDValue();
6131 :
6132 : SDLoc SL(Ld);
6133 :
6134 : assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6135 : "unexpected vector extload");
6136 :
6137 : // TODO: Drop only high part of range.
6138 48 : SDValue Ptr = Ld->getBasePtr();
6139 : SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6140 : MVT::i32, SL, Ld->getChain(), Ptr,
6141 : Ld->getOffset(),
6142 48 : Ld->getPointerInfo(), MVT::i32,
6143 : Ld->getAlignment(),
6144 48 : Ld->getMemOperand()->getFlags(),
6145 48 : Ld->getAAInfo(),
6146 48 : nullptr); // Drop ranges
6147 :
6148 48 : EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6149 48 : if (MemVT.isFloatingPoint()) {
6150 : assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6151 : "unexpected fp extload");
6152 0 : TruncVT = MemVT.changeTypeToInteger();
6153 : }
6154 :
6155 48 : SDValue Cvt = NewLoad;
6156 48 : if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6157 2 : Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6158 2 : DAG.getValueType(TruncVT));
6159 46 : } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6160 : Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6161 35 : Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6162 : } else {
6163 : assert(Ld->getExtensionType() == ISD::EXTLOAD);
6164 : }
6165 :
6166 48 : EVT VT = Ld->getValueType(0);
6167 48 : EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6168 :
6169 48 : DCI.AddToWorklist(Cvt.getNode());
6170 :
6171 : // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6172 : // the appropriate extension from the 32-bit load.
6173 48 : Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6174 48 : DCI.AddToWorklist(Cvt.getNode());
6175 :
6176 : // Handle conversion back to floating point if necessary.
6177 48 : Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6178 :
6179 96 : return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6180 : }
6181 :
6182 74303 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6183 : SDLoc DL(Op);
6184 : LoadSDNode *Load = cast<LoadSDNode>(Op);
6185 : ISD::LoadExtType ExtType = Load->getExtensionType();
6186 74303 : EVT MemVT = Load->getMemoryVT();
6187 :
6188 74303 : if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6189 : if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6190 2253 : return SDValue();
6191 :
6192 : // FIXME: Copied from PPC
6193 : // First, load into 32 bits, then truncate to 1 bit.
6194 :
6195 120 : SDValue Chain = Load->getChain();
6196 120 : SDValue BasePtr = Load->getBasePtr();
6197 120 : MachineMemOperand *MMO = Load->getMemOperand();
6198 :
6199 : EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6200 :
6201 : SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6202 120 : BasePtr, RealMemVT, MMO);
6203 :
6204 : SDValue Ops[] = {
6205 120 : DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6206 120 : NewLD.getValue(1)
6207 : };
6208 :
6209 120 : return DAG.getMergeValues(Ops, DL);
6210 : }
6211 :
6212 71930 : if (!MemVT.isVector())
6213 0 : return SDValue();
6214 :
6215 : assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6216 : "Custom lowering for non-i32 vectors hasn't been implemented.");
6217 :
6218 71930 : unsigned Alignment = Load->getAlignment();
6219 : unsigned AS = Load->getAddressSpace();
6220 71930 : if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
6221 : AS, Alignment)) {
6222 0 : SDValue Ops[2];
6223 0 : std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6224 0 : return DAG.getMergeValues(Ops, DL);
6225 : }
6226 :
6227 71930 : MachineFunction &MF = DAG.getMachineFunction();
6228 71930 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6229             :   // If there is a possibility that a flat instruction accesses scratch memory,
6230             :   // then we need to use the same legalization rules we use for private.
6231 71930 : if (AS == AMDGPUAS::FLAT_ADDRESS)
6232 27 : AS = MFI->hasFlatScratchInit() ?
6233 : AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6234 :
6235 : unsigned NumElements = MemVT.getVectorNumElements();
6236 :
6237 143860 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6238 71930 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
6239 47010 : if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
6240 46756 : return SDValue();
6241 : // Non-uniform loads will be selected to MUBUF instructions, so they
6242 : // have the same legalization requirements as global and private
6243 : // loads.
6244 : //
6245 : }
6246 :
6247 25174 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6248 24920 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6249 : AS == AMDGPUAS::GLOBAL_ADDRESS) {
6250 7618 : if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
6251 2443 : !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
6252 14761 : Alignment >= 4 && NumElements < 32)
6253 841 : return SDValue();
6254 : // Non-uniform loads will be selected to MUBUF instructions, so they
6255 : // have the same legalization requirements as global and private
6256 : // loads.
6257 : //
6258 : }
6259 24333 : if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6260 : AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6261 24079 : AS == AMDGPUAS::GLOBAL_ADDRESS ||
6262 : AS == AMDGPUAS::FLAT_ADDRESS) {
6263 13059 : if (NumElements > 4)
6264 1263 : return SplitVectorLoad(Op, DAG);
6265 : // v4 loads are supported for private and global memory.
6266 11796 : return SDValue();
6267 : }
6268 11274 : if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6269 : // Depending on the setting of the private_element_size field in the
6270 : // resource descriptor, we can only make private accesses up to a certain
6271 : // size.
6272 379 : switch (Subtarget->getMaxPrivateElementSize()) {
6273 216 : case 4:
6274 216 : return scalarizeVectorLoad(Load, DAG);
6275 53 : case 8:
6276 53 : if (NumElements > 2)
6277 5 : return SplitVectorLoad(Op, DAG);
6278 48 : return SDValue();
6279 110 : case 16:
6280 : // Same as global/flat
6281 110 : if (NumElements > 4)
6282 1 : return SplitVectorLoad(Op, DAG);
6283 109 : return SDValue();
6284 0 : default:
6285 0 : llvm_unreachable("unsupported private_element_size");
6286 : }
6287 10895 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6288 : // Use ds_read_b128 if possible.
6289 12303 : if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
6290 : MemVT.getStoreSize() == 16)
6291 1102 : return SDValue();
6292 :
6293 9793 : if (NumElements > 2)
6294 1224 : return SplitVectorLoad(Op, DAG);
6295 :
6296             :     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6297 : // address is negative, then the instruction is incorrectly treated as
6298 : // out-of-bounds even if base + offsets is in bounds. Split vectorized
6299 : // loads here to avoid emitting ds_read2_b32. We may re-combine the
6300 : // load later in the SILoadStoreOptimizer.
6301 3336 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6302 15241 : NumElements == 2 && MemVT.getStoreSize() == 8 &&
6303 3336 : Load->getAlignment() < 8) {
6304 19 : return SplitVectorLoad(Op, DAG);
6305 : }
6306 : }
6307 8550 : return SDValue();
6308 : }
6309 :
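 : // Sketch of LowerSELECT: a 64-bit select is lowered by bitcasting both
 : // operands to v2i32, selecting the low and high 32-bit halves separately
 : // with the same condition, and bitcasting the rebuilt vector back.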
6310 734 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6311 734 : EVT VT = Op.getValueType();
6312 : assert(VT.getSizeInBits() == 64);
6313 :
6314 : SDLoc DL(Op);
6315 734 : SDValue Cond = Op.getOperand(0);
6316 :
6317 734 : SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6318 734 : SDValue One = DAG.getConstant(1, DL, MVT::i32);
6319 :
6320 734 : SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6321 734 : SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6322 :
6323 734 : SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6324 734 : SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
6325 :
6326 734 : SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6327 :
6328 734 : SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6329 734 : SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
6330 :
6331 734 : SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6332 :
6333 1468 : SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
6334 734 : return DAG.getNode(ISD::BITCAST, DL, VT, Res);
6335 : }
6336 :
6337 : // Catch division cases where we can use shortcuts with rcp and rsq
6338 : // instructions.
6339 190 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6340 : SelectionDAG &DAG) const {
6341 : SDLoc SL(Op);
6342 190 : SDValue LHS = Op.getOperand(0);
6343 190 : SDValue RHS = Op.getOperand(1);
6344 190 : EVT VT = Op.getValueType();
6345 190 : const SDNodeFlags Flags = Op->getFlags();
6346 190 : bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
6347 :
6348 153 : if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6349 18 : return SDValue();
6350 :
6351 : if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
6352 112 : if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
6353 112 : if (CLHS->isExactlyValue(1.0)) {
6354 : // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6355 : // the CI documentation has a worst case error of 1 ulp.
6356 : // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6357 : // use it as long as we aren't trying to use denormals.
6358 : //
6359 : // v_rcp_f16 and v_rsq_f16 DO support denormals.
6360 :
6361 : // 1.0 / sqrt(x) -> rsq(x)
6362 :
6363 : // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6364 : // error seems really high at 2^29 ULP.
6365 75 : if (RHS.getOpcode() == ISD::FSQRT)
6366 7 : return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6367 :
6368 : // 1.0 / x -> rcp(x)
6369 68 : return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6370 : }
6371 :
6372 : // Same as for 1.0, but expand the sign out of the constant.
6373 37 : if (CLHS->isExactlyValue(-1.0)) {
6374 : // -1.0 / x -> rcp (fneg x)
6375 34 : SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6376 34 : return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6377 : }
6378 : }
6379 : }
6380 :
6381 63 : if (Unsafe) {
6382 : // Turn into multiply by the reciprocal.
6383 : // x / y -> x * (1.0 / y)
6384 12 : SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6385 12 : return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
6386 : }
6387 :
6388 51 : return SDValue();
6389 : }
6390 :
6391 0 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6392 : EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6393 0 : if (GlueChain->getNumValues() <= 1) {
6394 0 : return DAG.getNode(Opcode, SL, VT, A, B);
6395 : }
6396 :
6397 : assert(GlueChain->getNumValues() == 3);
6398 :
6399 0 : SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6400 0 : switch (Opcode) {
6401 0 : default: llvm_unreachable("no chain equivalent for opcode");
6402 0 : case ISD::FMUL:
6403 : Opcode = AMDGPUISD::FMUL_W_CHAIN;
6404 : break;
6405 : }
6406 :
6407 : return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6408 0 : GlueChain.getValue(2));
6409 : }
6410 :
6411 0 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6412 : EVT VT, SDValue A, SDValue B, SDValue C,
6413 : SDValue GlueChain) {
6414 0 : if (GlueChain->getNumValues() <= 1) {
6415 0 : return DAG.getNode(Opcode, SL, VT, A, B, C);
6416 : }
6417 :
6418 : assert(GlueChain->getNumValues() == 3);
6419 :
6420 0 : SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6421 0 : switch (Opcode) {
6422 0 : default: llvm_unreachable("no chain equivalent for opcode");
6423 0 : case ISD::FMA:
6424 : Opcode = AMDGPUISD::FMA_W_CHAIN;
6425 : break;
6426 : }
6427 :
6428 : return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6429 0 : GlueChain.getValue(2));
6430 : }
6431 :
6432 27 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
6433 27 : if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6434 24 : return FastLowered;
6435 :
6436 : SDLoc SL(Op);
6437 3 : SDValue Src0 = Op.getOperand(0);
6438 3 : SDValue Src1 = Op.getOperand(1);
6439 :
6440 3 : SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6441 3 : SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6442 :
6443 3 : SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6444 3 : SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6445 :
6446 3 : SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6447 3 : SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6448 :
6449 3 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6450 : }
6451 :
6452 : // Faster 2.5 ULP division that does not support denormals.
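 : // Rough idea of the scaling below: if |RHS| is huge (> 0x6f800000, about
 : // 2^96) its reciprocal could flush to zero, so RHS is pre-scaled by
 : // 0x2f800000 (2^-32) before the rcp and the quotient is multiplied by the
 : // same scale factor afterwards to compensate.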
6453 33 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6454 : SDLoc SL(Op);
6455 33 : SDValue LHS = Op.getOperand(1);
6456 33 : SDValue RHS = Op.getOperand(2);
6457 :
6458 33 : SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6459 :
6460 33 : const APFloat K0Val(BitsToFloat(0x6f800000));
6461 33 : const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6462 :
6463 33 : const APFloat K1Val(BitsToFloat(0x2f800000));
6464 33 : const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6465 :
6466 33 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6467 :
6468 : EVT SetCCVT =
6469 33 : getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6470 :
6471 33 : SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6472 :
6473 33 : SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6474 :
6475 : // TODO: Should this propagate fast-math-flags?
6476 33 : r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6477 :
6478 : // rcp does not support denormals.
6479 33 : SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6480 :
6481 33 : SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6482 :
6483 33 : return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6484 : }
6485 :
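 : // Rough shape of LowerFDIV32: both operands are conditioned with DIV_SCALE,
 : // an initial RCP of the denominator is refined with a short FMA-based
 : // Newton-Raphson sequence, and DIV_FMAS/DIV_FIXUP apply the final scale and
 : // special-case handling; when FP32 denormals are off, denormal mode is
 : // temporarily enabled around the FMA sequence via SETREG.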
6486 156 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
6487 156 : if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6488 90 : return FastLowered;
6489 :
6490 : SDLoc SL(Op);
6491 66 : SDValue LHS = Op.getOperand(0);
6492 66 : SDValue RHS = Op.getOperand(1);
6493 :
6494 66 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6495 :
6496 66 : SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
6497 :
6498 : SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6499 66 : RHS, RHS, LHS);
6500 : SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6501 66 : LHS, RHS, LHS);
6502 :
6503 : // Denominator is scaled to not be denormal, so using rcp is ok.
6504 : SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6505 66 : DenominatorScaled);
6506 : SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6507 66 : DenominatorScaled);
6508 :
6509 : const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6510 : (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
6511 : (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
6512 :
6513 66 : const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
6514 :
6515 66 : if (!Subtarget->hasFP32Denormals()) {
6516 48 : SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6517 : const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6518 48 : SL, MVT::i32);
6519 : SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6520 : DAG.getEntryNode(),
6521 48 : EnableDenormValue, BitField);
6522 : SDValue Ops[3] = {
6523 : NegDivScale0,
6524 48 : EnableDenorm.getValue(0),
6525 48 : EnableDenorm.getValue(1)
6526 48 : };
6527 :
6528 48 : NegDivScale0 = DAG.getMergeValues(Ops, SL);
6529 : }
6530 :
6531 : SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6532 66 : ApproxRcp, One, NegDivScale0);
6533 :
6534 : SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6535 66 : ApproxRcp, Fma0);
6536 :
6537 : SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6538 66 : Fma1, Fma1);
6539 :
6540 : SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6541 66 : NumeratorScaled, Mul);
6542 :
6543 66 : SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6544 :
6545 : SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6546 66 : NumeratorScaled, Fma3);
6547 :
6548 66 : if (!Subtarget->hasFP32Denormals()) {
6549 : const SDValue DisableDenormValue =
6550 48 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6551 : SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6552 : Fma4.getValue(1),
6553 : DisableDenormValue,
6554 : BitField,
6555 48 : Fma4.getValue(2));
6556 :
6557 : SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6558 48 : DisableDenorm, DAG.getRoot());
6559 48 : DAG.setRoot(OutputChain);
6560 : }
6561 :
6562 66 : SDValue Scale = NumeratorScaled.getValue(1);
6563 : SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6564 66 : Fma4, Fma1, Fma3, Scale);
6565 :
6566 66 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
6567 : }
6568 :
6569 68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
6570 68 : if (DAG.getTarget().Options.UnsafeFPMath)
6571 7 : return lowerFastUnsafeFDIV(Op, DAG);
6572 :
6573 : SDLoc SL(Op);
6574 61 : SDValue X = Op.getOperand(0);
6575 61 : SDValue Y = Op.getOperand(1);
6576 :
6577 61 : const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
6578 :
6579 61 : SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6580 :
6581 61 : SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6582 :
6583 61 : SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6584 :
6585 61 : SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6586 :
6587 61 : SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6588 :
6589 61 : SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6590 :
6591 61 : SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6592 :
6593 61 : SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6594 :
6595 61 : SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6596 61 : SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6597 :
6598 : SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6599 61 : NegDivScale0, Mul, DivScale1);
6600 :
6601 61 : SDValue Scale;
6602 :
6603 61 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
6604 : // Workaround a hardware bug on SI where the condition output from div_scale
6605 : // is not usable.
6606 :
6607 23 : const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
6608 :
6609             :     // Figure out which scale to use for div_fmas.
6610 23 : SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6611 23 : SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6612 23 : SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6613 23 : SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6614 :
6615 23 : SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6616 23 : SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6617 :
6618 : SDValue Scale0Hi
6619 23 : = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6620 : SDValue Scale1Hi
6621 23 : = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6622 :
6623 23 : SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6624 23 : SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6625 23 : Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6626 : } else {
6627 38 : Scale = DivScale1.getValue(1);
6628 : }
6629 :
6630 : SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6631 61 : Fma4, Fma3, Mul, Scale);
6632 :
6633 61 : return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
6634 : }
6635 :
6636 251 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6637 251 : EVT VT = Op.getValueType();
6638 :
6639 : if (VT == MVT::f32)
6640 156 : return LowerFDIV32(Op, DAG);
6641 :
6642 : if (VT == MVT::f64)
6643 68 : return LowerFDIV64(Op, DAG);
6644 :
6645 : if (VT == MVT::f16)
6646 27 : return LowerFDIV16(Op, DAG);
6647 :
6648 0 : llvm_unreachable("Unexpected type for fdiv");
6649 : }
6650 :
6651 82361 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6652 : SDLoc DL(Op);
6653 : StoreSDNode *Store = cast<StoreSDNode>(Op);
6654 82361 : EVT VT = Store->getMemoryVT();
6655 :
6656 : if (VT == MVT::i1) {
6657 : return DAG.getTruncStore(Store->getChain(), DL,
6658 : DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6659 380 : Store->getBasePtr(), MVT::i1, Store->getMemOperand());
6660 : }
6661 :
6662 : assert(VT.isVector() &&
6663 : Store->getValue().getValueType().getScalarType() == MVT::i32);
6664 :
6665 : unsigned AS = Store->getAddressSpace();
6666 82171 : if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6667 : AS, Store->getAlignment())) {
6668 24 : return expandUnalignedStore(Store, DAG);
6669 : }
6670 :
6671 82147 : MachineFunction &MF = DAG.getMachineFunction();
6672 82147 : SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6673             :   // If there is a possibility that a flat instruction accesses scratch memory,
6674             :   // then we need to use the same legalization rules we use for private.
6675 82147 : if (AS == AMDGPUAS::FLAT_ADDRESS)
6676 261 : AS = MFI->hasFlatScratchInit() ?
6677 : AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6678 :
6679 : unsigned NumElements = VT.getVectorNumElements();
6680 82147 : if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6681 : AS == AMDGPUAS::FLAT_ADDRESS) {
6682 40502 : if (NumElements > 4)
6683 4385 : return SplitVectorStore(Op, DAG);
6684 36117 : return SDValue();
6685 41645 : } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6686 564 : switch (Subtarget->getMaxPrivateElementSize()) {
6687 344 : case 4:
6688 344 : return scalarizeVectorStore(Store, DAG);
6689 86 : case 8:
6690 86 : if (NumElements > 2)
6691 10 : return SplitVectorStore(Op, DAG);
6692 76 : return SDValue();
6693 134 : case 16:
6694 134 : if (NumElements > 4)
6695 2 : return SplitVectorStore(Op, DAG);
6696 132 : return SDValue();
6697 0 : default:
6698 0 : llvm_unreachable("unsupported private_element_size");
6699 : }
6700 41081 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6701 : // Use ds_write_b128 if possible.
6702 46309 : if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
6703 : VT.getStoreSize() == 16)
6704 4490 : return SDValue();
6705 :
6706 36591 : if (NumElements > 2)
6707 4042 : return SplitVectorStore(Op, DAG);
6708 :
6709             :     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6710 : // address is negative, then the instruction is incorrectly treated as
6711 : // out-of-bounds even if base + offsets is in bounds. Split vectorized
6712 : // stores here to avoid emitting ds_write2_b32. We may re-combine the
6713 : // store later in the SILoadStoreOptimizer.
6714 14225 : if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6715 60999 : NumElements == 2 && VT.getStoreSize() == 8 &&
6716 14225 : Store->getAlignment() < 8) {
6717 33 : return SplitVectorStore(Op, DAG);
6718 : }
6719 :
6720 32516 : return SDValue();
6721 : } else {
6722 0 : llvm_unreachable("unhandled address space");
6723 : }
6724 : }
6725 :
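 : // Sketch of LowerTrig: the hardware SIN/COS take an argument pre-multiplied
 : // by 1/(2*pi); subtargets with a reduced valid input range additionally need
 : // the scaled value wrapped into [0, 1) with FRACT before the hardware op.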
6726 98 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
6727 : SDLoc DL(Op);
6728 98 : EVT VT = Op.getValueType();
6729 98 : SDValue Arg = Op.getOperand(0);
6730 98 : SDValue TrigVal;
6731 :
6732 : // TODO: Should this propagate fast-math-flags?
6733 :
6734 98 : SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
6735 :
6736 98 : if (Subtarget->hasTrigReducedRange()) {
6737 68 : SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6738 68 : TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
6739 : } else {
6740 30 : TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6741 : }
6742 :
6743 98 : switch (Op.getOpcode()) {
6744 : case ISD::FCOS:
6745 60 : return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
6746 : case ISD::FSIN:
6747 136 : return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
6748 0 : default:
6749 0 : llvm_unreachable("Wrong trig opcode");
6750 : }
6751 : }
6752 :
6753 263 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
6754 : AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
6755 : assert(AtomicNode->isCompareAndSwap());
6756 : unsigned AS = AtomicNode->getAddressSpace();
6757 :
6758 : // No custom lowering required for local address space
6759 263 : if (!isFlatGlobalAddrSpace(AS))
6760 66 : return Op;
6761 :
6762 : // Non-local address space requires custom lowering for atomic compare
6763 : // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
6764 : SDLoc DL(Op);
6765 197 : SDValue ChainIn = Op.getOperand(0);
6766 197 : SDValue Addr = Op.getOperand(1);
6767 197 : SDValue Old = Op.getOperand(2);
6768 197 : SDValue New = Op.getOperand(3);
6769 197 : EVT VT = Op.getValueType();
6770 197 : MVT SimpleVT = VT.getSimpleVT();
6771 197 : MVT VecType = MVT::getVectorVT(SimpleVT, 2);
6772 :
6773 394 : SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
6774 197 : SDValue Ops[] = { ChainIn, Addr, NewOld };
6775 :
6776 : return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
6777 591 : Ops, VT, AtomicNode->getMemOperand());
6778 : }
6779 :
6780 : //===----------------------------------------------------------------------===//
6781 : // Custom DAG optimizations
6782 : //===----------------------------------------------------------------------===//
6783 :
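 : // Roughly: when the source of this integer-to-f32 conversion is known to
 : // have its top 24 bits clear, the conversion is replaced with
 : // CVT_F32_UBYTE0 of that source.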
6784 1699 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
6785 : DAGCombinerInfo &DCI) const {
6786 1699 : EVT VT = N->getValueType(0);
6787 1699 : EVT ScalarVT = VT.getScalarType();
6788 1699 : if (ScalarVT != MVT::f32)
6789 207 : return SDValue();
6790 :
6791 1492 : SelectionDAG &DAG = DCI.DAG;
6792 : SDLoc DL(N);
6793 :
6794 1492 : SDValue Src = N->getOperand(0);
6795 : EVT SrcVT = Src.getValueType();
6796 :
6797 : // TODO: We could try to match extracting the higher bytes, which would be
6798 : // easier if i8 vectors weren't promoted to i32 vectors, particularly after
6799 : // types are legalized. v4i8 -> v4f32 is probably the only case to worry
6800 : // about in practice.
6801 1492 : if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
6802 1162 : if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
6803 107 : SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
6804 107 : DCI.AddToWorklist(Cvt.getNode());
6805 107 : return Cvt;
6806 : }
6807 : }
6808 :
6809 1385 : return SDValue();
6810 : }
6811 :
6812 : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
6813 :
6814 : // This is a variant of
6815 : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
6816 : //
6817 : // The normal DAG combiner will do this, but only if the add has one use since
6818 : // that would increase the number of instructions.
6819 : //
6820 : // This prevents us from seeing a constant offset that can be folded into a
6821 : // memory instruction's addressing mode. If we know the resulting add offset of
6822 : // a pointer can be folded into an addressing offset, we can replace the pointer
6823 : // operand with the add of new constant offset. This eliminates one of the uses,
6824 : // and may allow the remaining use to also be simplified.
6825 : //
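 : // For example, with a 4-byte access: (shl (add x, 16), 2) is rewritten to
 : // (add (shl x, 2), 64), letting the 64 fold into the addressing-mode offset
 : // while the other users of the add keep the original node.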
6826 213 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
6827 : unsigned AddrSpace,
6828 : EVT MemVT,
6829 : DAGCombinerInfo &DCI) const {
6830 213 : SDValue N0 = N->getOperand(0);
6831 213 : SDValue N1 = N->getOperand(1);
6832 :
6833 : // We only do this to handle cases where it's profitable when there are
6834 : // multiple uses of the add, so defer to the standard combine.
6835 213 : if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
6836 : N0->hasOneUse())
6837 167 : return SDValue();
6838 :
6839 : const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
6840 : if (!CN1)
6841 0 : return SDValue();
6842 :
6843 : const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6844 : if (!CAdd)
6845 2 : return SDValue();
6846 :
6847 : // If the resulting offset is too large, we can't fold it into the addressing
6848 : // mode offset.
6849 88 : APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
6850 44 : Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
6851 :
6852 44 : AddrMode AM;
6853 44 : AM.HasBaseReg = true;
6854 44 : AM.BaseOffs = Offset.getSExtValue();
6855 44 : if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
6856 14 : return SDValue();
6857 :
6858 30 : SelectionDAG &DAG = DCI.DAG;
6859 : SDLoc SL(N);
6860 60 : EVT VT = N->getValueType(0);
6861 :
6862 30 : SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
6863 30 : SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
6864 :
6865 : SDNodeFlags Flags;
6866 30 : Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
6867 0 : (N0.getOpcode() == ISD::OR ||
6868 0 : N0->getFlags().hasNoUnsignedWrap()));
6869 :
6870 30 : return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
6871 : }
6872 :
6873 341305 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
6874 : DAGCombinerInfo &DCI) const {
6875 341305 : SDValue Ptr = N->getBasePtr();
6876 341305 : SelectionDAG &DAG = DCI.DAG;
6877 : SDLoc SL(N);
6878 :
6879 : // TODO: We could also do this for multiplies.
6880 341305 : if (Ptr.getOpcode() == ISD::SHL) {
6881 : SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
6882 213 : N->getMemoryVT(), DCI);
6883 213 : if (NewPtr) {
6884 30 : SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
6885 :
6886 42 : NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
6887 30 : return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
6888 : }
6889 : }
6890 :
6891 341275 : return SDValue();
6892 : }
6893 :
6894 : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
6895 2957 : return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
6896 3555 : (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
6897 2021 : (Opc == ISD::XOR && Val == 0);
6898 : }
6899 :
6900 : // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
6901 : // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
6902 : // integer combine opportunities since most 64-bit operations are decomposed
6903 : // this way. TODO: We won't want this for SALU especially if it is an inline
6904 : // immediate.
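 : // For example, (and x:i64, 0x00000000ffffffff) splits into an AND of the low
 : // half with 0xffffffff (a no-op) and an AND of the high half with 0 (zero),
 : // both of which fold away in later 32-bit combines.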
6905 2001 : SDValue SITargetLowering::splitBinaryBitConstantOp(
6906 : DAGCombinerInfo &DCI,
6907 : const SDLoc &SL,
6908 : unsigned Opc, SDValue LHS,
6909 : const ConstantSDNode *CRHS) const {
6910 2001 : uint64_t Val = CRHS->getZExtValue();
6911 : uint32_t ValLo = Lo_32(Val);
6912 : uint32_t ValHi = Hi_32(Val);
6913 2001 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6914 :
6915 : if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
6916 : bitOpWithConstantIsReducible(Opc, ValHi)) ||
6917 378 : (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
6918 : // If we need to materialize a 64-bit immediate, it will be split up later
6919 : // anyway. Avoid creating the harder to understand 64-bit immediate
6920 : // materialization.
6921 1630 : return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
6922 : }
6923 :
6924 371 : return SDValue();
6925 : }
6926 :
6927 : // Returns true if argument is a boolean value which is not serialized into
6928             : // Returns true if the argument is a boolean value which is not serialized into
6929             : // memory or an argument and does not require v_cndmask_b32 to be deserialized.
6930 0 : if (V.getValueType() != MVT::i1)
6931 0 : return false;
6932 240 : switch (V.getOpcode()) {
6933 : default: break;
6934 : case ISD::SETCC:
6935 : case ISD::AND:
6936 : case ISD::OR:
6937 : case ISD::XOR:
6938 : case AMDGPUISD::FP_CLASS:
6939 : return true;
6940 : }
6941 0 : return false;
6942 : }
6943 :
6944 : // If a constant has all zeroes or all ones within each byte return it.
6945 : // Otherwise return 0.
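 : // E.g. 0x00ff00ff is returned unchanged, while 0x00f000ff returns 0 because
 : // one byte is only partially selected.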
6946 318 : static uint32_t getConstantPermuteMask(uint32_t C) {
6947 : // 0xff for any zero byte in the mask
6948 : uint32_t ZeroByteMask = 0;
6949 318 : if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
6950 318 : if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
6951 318 : if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
6952 318 : if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
6953 318 : uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
6954 318 : if ((NonZeroByteMask & C) != NonZeroByteMask)
6955 12 : return 0; // Partial bytes selected.
6956 : return C;
6957 : }
6958 :
6959 : // Check if a node selects whole bytes from its operand 0 starting at a byte
6960 : // boundary while masking the rest. Returns select mask as in the v_perm_b32
6961             : // boundary while masking the rest. Returns the select mask as used by
6962             : // v_perm_b32, or ~0 if it did not succeed.
6963 : // value 0-3 selects corresponding source byte;
6964 : // value 0xc selects zero;
6965 : // value 0xff selects 0xff.
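 : // For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100 (zero the two
 : // high bytes, pass the two low bytes through), and (srl x, 16) yields
 : // 0x0c0c0302 (source bytes 2 and 3 in the low positions, zeros above).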
6966 0 : static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
6967 : assert(V.getValueSizeInBits() == 32);
6968 :
6969 0 : if (V.getNumOperands() != 2)
6970 0 : return ~0;
6971 :
6972 : ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
6973 : if (!N1)
6974 0 : return ~0;
6975 :
6976 0 : uint32_t C = N1->getZExtValue();
6977 :
6978 0 : switch (V.getOpcode()) {
6979 : default:
6980 : break;
6981 0 : case ISD::AND:
6982 0 : if (uint32_t ConstMask = getConstantPermuteMask(C)) {
6983 0 : return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
6984 0 : }
6985 : break;
6986 :
6987 0 : case ISD::OR:
6988 0 : if (uint32_t ConstMask = getConstantPermuteMask(C)) {
6989 0 : return (0x03020100 & ~ConstMask) | ConstMask;
6990 0 : }
6991 : break;
6992 :
6993 0 : case ISD::SHL:
6994 0 : if (C % 8)
6995 0 : return ~0;
6996 :
6997 0 : return uint32_t((0x030201000c0c0c0cull << C) >> 32);
6998 :
6999 0 : case ISD::SRL:
7000 0 : if (C % 8)
7001 0 : return ~0;
7002 :
7003 0 : return uint32_t(0x0c0c0c0c03020100ull >> C);
7004 : }
7005 :
7006 : return ~0;
7007 : }
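// A standalone sketch of the shift cases above (illustrative only; this helper
// is hypothetical and not used by the lowering). It reproduces the mask
// constants so concrete values can be checked by hand, e.g.
//   srl x, 8  -> 0x0c030201  (result byte 3 selects zero)
//   shl x, 8  -> 0x0201000c  (result byte 0 selects zero)
LLVM_ATTRIBUTE_UNUSED
static uint32_t modelShiftPermuteMask(bool IsShl, uint32_t ByteAlignedShift) {
  assert(ByteAlignedShift % 8 == 0 && ByteAlignedShift < 32);
  // 0x03020100 is the identity byte select; 0x0c lanes select zero.
  return IsShl ? uint32_t((0x030201000c0c0c0cull << ByteAlignedShift) >> 32)
               : uint32_t(0x0c0c0c0c03020100ull >> ByteAlignedShift);
}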
7008 :
7009 36547 : SDValue SITargetLowering::performAndCombine(SDNode *N,
7010 : DAGCombinerInfo &DCI) const {
7011 36547 : if (DCI.isBeforeLegalize())
7012 1407 : return SDValue();
7013 :
7014 35140 : SelectionDAG &DAG = DCI.DAG;
7015 35140 : EVT VT = N->getValueType(0);
7016 35140 : SDValue LHS = N->getOperand(0);
7017 35140 : SDValue RHS = N->getOperand(1);
7018 :
7019 :
7020 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7021 1823 : if (VT == MVT::i64 && CRHS) {
7022 1611 : if (SDValue Split
7023 1611 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7024 1527 : return Split;
7025 : }
7026 :
7027 33613 : if (CRHS && VT == MVT::i32) {
7028 : // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7029 : // nb = number of trailing zeroes in mask
7030 : // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7031 :     // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
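    // A concrete instance (illustrative): (and (srl x, 8), 0xff00) has
    // Bits = 8 and nb = 8, so it becomes (shl (bfe_u32 x, 16, 8), 8), i.e.
    // bits [23:16] of x extracted and repositioned at [15:8].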
7032 29468 : uint64_t Mask = CRHS->getZExtValue();
7033 : unsigned Bits = countPopulation(Mask);
7034 41674 : if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7035 34912 : (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7036 54 : if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7037 54 : unsigned Shift = CShift->getZExtValue();
7038 54 : unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7039 54 : unsigned Offset = NB + Shift;
7040 54 : if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7041 : SDLoc SL(N);
7042 : SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7043 54 : LHS->getOperand(0),
7044 : DAG.getConstant(Offset, SL, MVT::i32),
7045 54 : DAG.getConstant(Bits, SL, MVT::i32));
7046 54 : EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7047 : SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7048 54 : DAG.getValueType(NarrowVT));
7049 54 : SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7050 108 : DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7051 54 : return Shl;
7052 : }
7053 : }
7054 : }
7055 :
7056 : // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7057 29414 : if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7058 : isa<ConstantSDNode>(LHS.getOperand(2))) {
7059 2 : uint32_t Sel = getConstantPermuteMask(Mask);
7060 2 : if (!Sel)
7061 0 : return SDValue();
7062 :
7063 : // Select 0xc for all zero bytes
7064 2 : Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7065 : SDLoc DL(N);
7066 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7067 2 : LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7068 : }
7069 : }
7070 :
7071 : // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7072 : // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7073 33557 : if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
7074 325 : ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7075 325 : ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7076 :
7077 325 : SDValue X = LHS.getOperand(0);
7078 325 : SDValue Y = RHS.getOperand(0);
7079 325 : if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7080 321 : return SDValue();
7081 :
7082 4 : if (LCC == ISD::SETO) {
7083 : if (X != LHS.getOperand(1))
7084 0 : return SDValue();
7085 :
7086 2 : if (RCC == ISD::SETUNE) {
7087 : const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7088 4 : if (!C1 || !C1->isInfinity() || C1->isNegative())
7089 0 : return SDValue();
7090 :
7091 : const uint32_t Mask = SIInstrFlags::N_NORMAL |
7092 : SIInstrFlags::N_SUBNORMAL |
7093 : SIInstrFlags::N_ZERO |
7094 : SIInstrFlags::P_ZERO |
7095 : SIInstrFlags::P_SUBNORMAL |
7096 : SIInstrFlags::P_NORMAL;
7097 :
7098 : static_assert(((~(SIInstrFlags::S_NAN |
7099 : SIInstrFlags::Q_NAN |
7100 : SIInstrFlags::N_INFINITY |
7101 : SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7102 : "mask not equal");
7103 :
7104 : SDLoc DL(N);
7105 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7106 2 : X, DAG.getConstant(Mask, DL, MVT::i32));
7107 : }
7108 : }
7109 : }
7110 :
7111 33234 : if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7112 : std::swap(LHS, RHS);
7113 :
7114 33243 : if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7115 : RHS.hasOneUse()) {
7116 8 : ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7117 : // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7118 : // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7119 : const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7120 8 : if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7121 8 : (RHS.getOperand(0) == LHS.getOperand(0) &&
7122 8 : LHS.getOperand(0) == LHS.getOperand(1))) {
7123 : const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7124 : unsigned NewMask = LCC == ISD::SETO ?
7125 5 : Mask->getZExtValue() & ~OrdMask :
7126 12 : Mask->getZExtValue() & OrdMask;
7127 :
7128 : SDLoc DL(N);
7129 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7130 6 : DAG.getConstant(NewMask, DL, MVT::i32));
7131 : }
7132 : }
7133 :
7134 29620 : if (VT == MVT::i32 &&
7135 29619 : (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7136 : // and x, (sext cc from i1) => select cc, x, 0
7137 24 : if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7138 : std::swap(LHS, RHS);
7139 24 : if (isBoolSGPR(RHS.getOperand(0)))
7140 16 : return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7141 48 : LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7142 : }
7143 :
7144 : // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
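  // A worked example of this combine (illustrative; values follow the mask
  // encoding in getPermuteMask and the selection logic below):
  //   (and (or x, 0xffffff00), (or y, 0x000000ff))
  // has per-operand masks 0xffffff00 and 0x030201ff, which use disjoint byte
  // lanes, and combines to (perm y, x, 0x07060500): bytes 3-1 taken from y,
  // byte 0 taken from x.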
7145 33212 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7146 47166 : if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7147 1837 : N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7148 324 : uint32_t LHSMask = getPermuteMask(DAG, LHS);
7149 324 : uint32_t RHSMask = getPermuteMask(DAG, RHS);
7150 324 : if (LHSMask != ~0u && RHSMask != ~0u) {
7151 : // Canonicalize the expression in an attempt to have fewer unique masks
7152 : // and therefore fewer registers used to hold the masks.
7153 1 : if (LHSMask > RHSMask) {
7154 : std::swap(LHSMask, RHSMask);
7155 : std::swap(LHS, RHS);
7156 : }
7157 :
7158 : // Select 0xc for each lane used from source operand. Zero has 0xc mask
7159 :       // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
7160 1 : uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7161 1 : uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7162 :
7163 :       // Check if we need to combine values from two sources within a byte.
7164 1 : if (!(LHSUsedLanes & RHSUsedLanes) &&
7165 : // If we select high and lower word keep it for SDWA.
7166 : // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7167 1 : !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7168 :           // Each byte of each mask is either a selector value 0-3, or has higher
7169 :           // bits set: 0xff to select the constant 0xff, or 0x0c to select zero. If
7170 :           // either mask selects zero for a byte, the result byte must be 0x0c;
7171 :           // otherwise the mask that is not 0xff wins. ANDing the two masks gives the
7172 :           // correct result except that 0x0c bytes must be forced back to exactly 0x0c.
7173 1 : uint32_t Mask = LHSMask & RHSMask;
7174 5 : for (unsigned I = 0; I < 32; I += 8) {
7175 4 : uint32_t ByteSel = 0xff << I;
7176 4 : if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7177 0 : Mask &= (0x0c << I) & 0xffffffff;
7178 : }
7179 :
7180 : // Add 4 to each active LHS lane. It will not affect any existing 0xff
7181 : // or 0x0c.
7182 1 : uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7183 : SDLoc DL(N);
7184 :
7185 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7186 : LHS.getOperand(0), RHS.getOperand(0),
7187 1 : DAG.getConstant(Sel, DL, MVT::i32));
7188 : }
7189 : }
7190 : }
7191 :
7192 33211 : return SDValue();
7193 : }
7194 :
7195 15730 : SDValue SITargetLowering::performOrCombine(SDNode *N,
7196 : DAGCombinerInfo &DCI) const {
7197 15730 : SelectionDAG &DAG = DCI.DAG;
7198 15730 : SDValue LHS = N->getOperand(0);
7199 15730 : SDValue RHS = N->getOperand(1);
7200 :
7201 15730 : EVT VT = N->getValueType(0);
7202 : if (VT == MVT::i1) {
7203 : // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7204 125 : if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7205 : RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7206 14 : SDValue Src = LHS.getOperand(0);
7207 : if (Src != RHS.getOperand(0))
7208 1 : return SDValue();
7209 :
7210 : const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7211 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7212 13 : if (!CLHS || !CRHS)
7213 0 : return SDValue();
7214 :
7215 : // Only 10 bits are used.
7216 : static const uint32_t MaxMask = 0x3ff;
7217 :
7218 39 : uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7219 : SDLoc DL(N);
7220 : return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7221 13 : Src, DAG.getConstant(NewMask, DL, MVT::i32));
7222 : }
7223 :
7224 111 : return SDValue();
7225 : }
7226 :
7227 : // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7228 5010 : if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7229 : LHS.getOpcode() == AMDGPUISD::PERM &&
7230 : isa<ConstantSDNode>(LHS.getOperand(2))) {
7231 3 : uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7232 3 : if (!Sel)
7233 0 : return SDValue();
7234 :
7235 3 : Sel |= LHS.getConstantOperandVal(2);
7236 : SDLoc DL(N);
7237 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7238 3 : LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7239 : }
7240 :
7241 : // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
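  // A worked example of this combine (illustrative; masks per getPermuteMask):
  //   (or (and x, 0x000000ff), (shl y, 24))
  // has per-operand masks 0x0c0c0c00 and 0x000c0c0c, which use disjoint byte
  // lanes, and combines to (perm y, x, 0x040c0c00): byte 3 taken from y's
  // byte 0, byte 0 taken from x, and the middle bytes zero.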
7242 15602 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7243 21065 : if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7244 7749 : N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7245 1087 : uint32_t LHSMask = getPermuteMask(DAG, LHS);
7246 1087 : uint32_t RHSMask = getPermuteMask(DAG, RHS);
7247 1087 : if (LHSMask != ~0u && RHSMask != ~0u) {
7248 : // Canonicalize the expression in an attempt to have fewer unique masks
7249 : // and therefore fewer registers used to hold the masks.
7250 141 : if (LHSMask > RHSMask) {
7251 : std::swap(LHSMask, RHSMask);
7252 : std::swap(LHS, RHS);
7253 : }
7254 :
7255 : // Select 0xc for each lane used from source operand. Zero has 0xc mask
7256 :       // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
7257 141 : uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7258 141 : uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7259 :
7260 : // Check of we need to combine values from two sources within a byte.
7261 141 : if (!(LHSUsedLanes & RHSUsedLanes) &&
7262 : // If we select high and lower word keep it for SDWA.
7263 : // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7264 141 : !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7265 : // Kill zero bytes selected by other mask. Zero value is 0xc.
7266 12 : LHSMask &= ~RHSUsedLanes;
7267 12 : RHSMask &= ~LHSUsedLanes;
7268 : // Add 4 to each active LHS lane
7269 12 : LHSMask |= LHSUsedLanes & 0x04040404;
7270 : // Combine masks
7271 12 : uint32_t Sel = LHSMask | RHSMask;
7272 : SDLoc DL(N);
7273 :
7274 : return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7275 : LHS.getOperand(0), RHS.getOperand(0),
7276 12 : DAG.getConstant(Sel, DL, MVT::i32));
7277 : }
7278 : }
7279 : }
7280 :
7281 : if (VT != MVT::i64)
7282 13382 : return SDValue();
7283 :
7284 : // TODO: This could be a generic combine with a predicate for extracting the
7285 : // high half of an integer being free.
7286 :
7287 : // (or i64:x, (zero_extend i32:y)) ->
7288 : // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7289 2208 : if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7290 : RHS.getOpcode() != ISD::ZERO_EXTEND)
7291 : std::swap(LHS, RHS);
7292 :
7293 2208 : if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7294 1277 : SDValue ExtSrc = RHS.getOperand(0);
7295 : EVT SrcVT = ExtSrc.getValueType();
7296 : if (SrcVT == MVT::i32) {
7297 : SDLoc SL(N);
7298 : SDValue LowLHS, HiBits;
7299 1277 : std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7300 1277 : SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7301 :
7302 1277 : DCI.AddToWorklist(LowOr.getNode());
7303 1277 : DCI.AddToWorklist(HiBits.getNode());
7304 :
7305 : SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7306 1277 : LowOr, HiBits);
7307 1277 : return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7308 : }
7309 : }
7310 :
7311 931 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7312 : if (CRHS) {
7313 81 : if (SDValue Split
7314 81 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7315 77 : return Split;
7316 : }
7317 :
7318 854 : return SDValue();
7319 : }
7320 :
7321 1928 : SDValue SITargetLowering::performXorCombine(SDNode *N,
7322 : DAGCombinerInfo &DCI) const {
7323 1928 : EVT VT = N->getValueType(0);
7324 : if (VT != MVT::i64)
7325 1331 : return SDValue();
7326 :
7327 597 : SDValue LHS = N->getOperand(0);
7328 597 : SDValue RHS = N->getOperand(1);
7329 :
7330 : const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7331 : if (CRHS) {
7332 309 : if (SDValue Split
7333 309 : = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7334 26 : return Split;
7335 : }
7336 :
7337 571 : return SDValue();
7338 : }
7339 :
7340 : // Instructions that will be lowered with a final instruction that zeros the
7341 : // high result bits.
7342 : // XXX - probably only need to list legal operations.
7343 324 : static bool fp16SrcZerosHighBits(unsigned Opc) {
7344 324 : switch (Opc) {
7345 : case ISD::FADD:
7346 : case ISD::FSUB:
7347 : case ISD::FMUL:
7348 : case ISD::FDIV:
7349 : case ISD::FREM:
7350 : case ISD::FMA:
7351 : case ISD::FMAD:
7352 : case ISD::FCANONICALIZE:
7353 : case ISD::FP_ROUND:
7354 : case ISD::UINT_TO_FP:
7355 : case ISD::SINT_TO_FP:
7356 : case ISD::FABS:
7357 : // Fabs is lowered to a bit operation, but it's an and which will clear the
7358 : // high bits anyway.
7359 : case ISD::FSQRT:
7360 : case ISD::FSIN:
7361 : case ISD::FCOS:
7362 : case ISD::FPOWI:
7363 : case ISD::FPOW:
7364 : case ISD::FLOG:
7365 : case ISD::FLOG2:
7366 : case ISD::FLOG10:
7367 : case ISD::FEXP:
7368 : case ISD::FEXP2:
7369 : case ISD::FCEIL:
7370 : case ISD::FTRUNC:
7371 : case ISD::FRINT:
7372 : case ISD::FNEARBYINT:
7373 : case ISD::FROUND:
7374 : case ISD::FFLOOR:
7375 : case ISD::FMINNUM:
7376 : case ISD::FMAXNUM:
7377 : case AMDGPUISD::FRACT:
7378 : case AMDGPUISD::CLAMP:
7379 : case AMDGPUISD::COS_HW:
7380 : case AMDGPUISD::SIN_HW:
7381 : case AMDGPUISD::FMIN3:
7382 : case AMDGPUISD::FMAX3:
7383 : case AMDGPUISD::FMED3:
7384 : case AMDGPUISD::FMAD_FTZ:
7385 : case AMDGPUISD::RCP:
7386 : case AMDGPUISD::RSQ:
7387 : case AMDGPUISD::RCP_IFLAG:
7388 : case AMDGPUISD::LDEXP:
7389 : return true;
7390 57 : default:
7391 : // fcopysign, select and others may be lowered to 32-bit bit operations
7392 : // which don't zero the high bits.
7393 57 : return false;
7394 : }
7395 : }
7396 :
7397 21073 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7398 : DAGCombinerInfo &DCI) const {
7399 21073 : if (!Subtarget->has16BitInsts() ||
7400 15844 : DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7401 17571 : return SDValue();
7402 :
7403 7004 : EVT VT = N->getValueType(0);
7404 : if (VT != MVT::i32)
7405 1770 : return SDValue();
7406 :
7407 1732 : SDValue Src = N->getOperand(0);
7408 : if (Src.getValueType() != MVT::i16)
7409 225 : return SDValue();
7410 :
7411 : // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7412 : // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7413 1507 : if (Src.getOpcode() == ISD::BITCAST) {
7414 324 : SDValue BCSrc = Src.getOperand(0);
7415 324 : if (BCSrc.getValueType() == MVT::f16 &&
7416 324 : fp16SrcZerosHighBits(BCSrc.getOpcode()))
7417 534 : return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7418 : }
7419 :
7420 1240 : return SDValue();
7421 : }
7422 :
7423 102 : SDValue SITargetLowering::performClassCombine(SDNode *N,
7424 : DAGCombinerInfo &DCI) const {
7425 102 : SelectionDAG &DAG = DCI.DAG;
7426 102 : SDValue Mask = N->getOperand(1);
7427 :
7428 : // fp_class x, 0 -> false
7429 : if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7430 168 : if (CMask->isNullValue())
7431 4 : return DAG.getConstant(0, SDLoc(N), MVT::i1);
7432 : }
7433 :
7434 200 : if (N->getOperand(0).isUndef())
7435 2 : return DAG.getUNDEF(MVT::i1);
7436 :
7437 98 : return SDValue();
7438 : }
7439 :
7440 763 : SDValue SITargetLowering::performRcpCombine(SDNode *N,
7441 : DAGCombinerInfo &DCI) const {
7442 763 : EVT VT = N->getValueType(0);
7443 763 : SDValue N0 = N->getOperand(0);
7444 :
7445 763 : if (N0.isUndef())
7446 1 : return N0;
7447 :
7448 666 : if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7449 : N0.getOpcode() == ISD::SINT_TO_FP)) {
7450 410 : return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7451 205 : N->getFlags());
7452 : }
7453 :
7454 557 : return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7455 : }
7456 :
7457 1366 : bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7458 : unsigned MaxDepth) const {
7459 : unsigned Opcode = Op.getOpcode();
7460 1366 : if (Opcode == ISD::FCANONICALIZE)
7461 : return true;
7462 :
7463 : if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7464 11 : auto F = CFP->getValueAPF();
7465 11 : if (F.isNaN() && F.isSignaling())
7466 : return false;
7467 11 : return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7468 : }
7469 :
7470 : // If source is a result of another standard FP operation it is already in
7471 : // canonical form.
7472 1355 : if (MaxDepth == 0)
7473 : return false;
7474 :
7475 1355 : switch (Opcode) {
7476 : // These will flush denorms if required.
7477 : case ISD::FADD:
7478 : case ISD::FSUB:
7479 : case ISD::FMUL:
7480 : case ISD::FCEIL:
7481 : case ISD::FFLOOR:
7482 : case ISD::FMA:
7483 : case ISD::FMAD:
7484 : case ISD::FSQRT:
7485 : case ISD::FDIV:
7486 : case ISD::FREM:
7487 : case ISD::FP_ROUND:
7488 : case ISD::FP_EXTEND:
7489 : case AMDGPUISD::FMUL_LEGACY:
7490 : case AMDGPUISD::FMAD_FTZ:
7491 : case AMDGPUISD::RCP:
7492 : case AMDGPUISD::RSQ:
7493 : case AMDGPUISD::RSQ_CLAMP:
7494 : case AMDGPUISD::RCP_LEGACY:
7495 : case AMDGPUISD::RSQ_LEGACY:
7496 : case AMDGPUISD::RCP_IFLAG:
7497 : case AMDGPUISD::TRIG_PREOP:
7498 : case AMDGPUISD::DIV_SCALE:
7499 : case AMDGPUISD::DIV_FMAS:
7500 : case AMDGPUISD::DIV_FIXUP:
7501 : case AMDGPUISD::FRACT:
7502 : case AMDGPUISD::LDEXP:
7503 : case AMDGPUISD::CVT_PKRTZ_F16_F32:
7504 : case AMDGPUISD::CVT_F32_UBYTE0:
7505 : case AMDGPUISD::CVT_F32_UBYTE1:
7506 : case AMDGPUISD::CVT_F32_UBYTE2:
7507 : case AMDGPUISD::CVT_F32_UBYTE3:
7508 : return true;
7509 :
7510 : // It can/will be lowered or combined as a bit operation.
7511 : // Need to check their input recursively to handle.
7512 146 : case ISD::FNEG:
7513 : case ISD::FABS:
7514 : case ISD::FCOPYSIGN:
7515 292 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7516 :
7517 : case ISD::FSIN:
7518 : case ISD::FCOS:
7519 : case ISD::FSINCOS:
7520 30 : return Op.getValueType().getScalarType() != MVT::f16;
7521 :
7522 67 : case ISD::FMINNUM:
7523 : case ISD::FMAXNUM:
7524 : case AMDGPUISD::CLAMP:
7525 : case AMDGPUISD::FMED3:
7526 : case AMDGPUISD::FMAX3:
7527 : case AMDGPUISD::FMIN3: {
7528 : // FIXME: Shouldn't treat the generic operations different based these.
7529 67 : bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
7530 67 : if (IsIEEEMode) {
7531 : // snans will be quieted, so we only need to worry about denormals.
7532 84 : if (Subtarget->supportsMinMaxDenormModes() ||
7533 34 : denormalsEnabledForType(Op.getValueType()))
7534 26 : return true;
7535 :
7536 : // Flushing may be required.
7537 : // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7538 : // targets need to check their input recursively.
7539 52 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7540 4 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7541 : }
7542 :
7543 28 : if (Subtarget->supportsMinMaxDenormModes() ||
7544 11 : denormalsEnabledForType(Op.getValueType())) {
7545 : // Only quieting may be necessary.
7546 9 : return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
7547 3 : DAG.isKnownNeverSNaN(Op.getOperand(1));
7548 : }
7549 :
7550 : // Flushing and quieting may be necessary
7551 : // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
7552 : // needs to be quieted.
7553 16 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7554 0 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7555 : }
7556 5 : case ISD::SELECT: {
7557 15 : return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7558 5 : isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
7559 : }
7560 63 : case ISD::BUILD_VECTOR: {
7561 110 : for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7562 94 : SDValue SrcOp = Op.getOperand(i);
7563 94 : if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7564 47 : return false;
7565 : }
7566 :
7567 : return true;
7568 : }
7569 72 : case ISD::EXTRACT_VECTOR_ELT:
7570 : case ISD::EXTRACT_SUBVECTOR: {
7571 144 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7572 : }
7573 86 : case ISD::INSERT_VECTOR_ELT: {
7574 188 : return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7575 16 : isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7576 : }
7577 38 : case ISD::UNDEF:
7578 : // Could be anything.
7579 38 : return false;
7580 :
7581 25 : case ISD::INTRINSIC_WO_CHAIN: {
7582 : unsigned IntrinsicID
7583 50 : = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7584 : // TODO: Handle more intrinsics
7585 : switch (IntrinsicID) {
7586 : case Intrinsic::amdgcn_cvt_pkrtz:
7587 : case Intrinsic::amdgcn_cubeid:
7588 : case Intrinsic::amdgcn_frexp_mant:
7589 : case Intrinsic::amdgcn_fdot2:
7590 : return true;
7591 : default:
7592 : break;
7593 : }
7594 :
7595 : LLVM_FALLTHROUGH;
7596 : }
7597 : default:
7598 643 : return denormalsEnabledForType(Op.getValueType()) &&
7599 364 : DAG.isKnownNeverSNaN(Op);
7600 : }
7601 :
7602 : llvm_unreachable("invalid operation");
7603 : }
7604 :
7605 : // Constant fold canonicalize.
7606 :
7607 149 : SDValue SITargetLowering::getCanonicalConstantFP(
7608 : SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7609 : // Flush denormals to 0 if not enabled.
7610 149 : if (C.isDenormal() && !denormalsEnabledForType(VT))
7611 4 : return DAG.getConstantFP(0.0, SL, VT);
7612 :
7613 145 : if (C.isNaN()) {
7614 : APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7615 55 : if (C.isSignaling()) {
7616 : // Quiet a signaling NaN.
7617 : // FIXME: Is this supposed to preserve payload bits?
7618 30 : return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7619 : }
7620 :
7621 : // Make sure it is the canonical NaN bitpattern.
7622 : //
7623 : // TODO: Can we use -1 as the canonical NaN value since it's an inline
7624 : // immediate?
7625 50 : if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7626 18 : return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7627 : }
7628 :
7629 : // Already canonical.
7630 97 : return DAG.getConstantFP(C, SL, VT);
7631 : }
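// Illustrative f32 results of the folding above (bit patterns assume the
// default APFloat canonical quiet NaN):
//   sNaN 0x7f800001      -> qNaN 0x7fc00000
//   denormal 0x00000001  -> +0.0, unless f32 denormals are enabled
//   qNaN 0x7fc00001      -> qNaN 0x7fc00000 (payload not preserved)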
7632 :
7633 0 : static bool vectorEltWillFoldAway(SDValue Op) {
7634 64 : return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7635 : }
7636 :
7637 1086 : SDValue SITargetLowering::performFCanonicalizeCombine(
7638 : SDNode *N,
7639 : DAGCombinerInfo &DCI) const {
7640 1086 : SelectionDAG &DAG = DCI.DAG;
7641 1086 : SDValue N0 = N->getOperand(0);
7642 2172 : EVT VT = N->getValueType(0);
7643 :
7644 : // fcanonicalize undef -> qnan
7645 1086 : if (N0.isUndef()) {
7646 23 : APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7647 46 : return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7648 : }
7649 :
7650 1063 : if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
7651 137 : EVT VT = N->getValueType(0);
7652 274 : return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
7653 : }
7654 :
7655 : // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7656 : // (fcanonicalize k)
7657 : //
7658 : // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7659 :
7660 : // TODO: This could be better with wider vectors that will be split to v2f16,
7661 : // and to consider uses since there aren't that many packed operations.
7662 926 : if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7663 : isTypeLegal(MVT::v2f16)) {
7664 : SDLoc SL(N);
7665 39 : SDValue NewElts[2];
7666 39 : SDValue Lo = N0.getOperand(0);
7667 39 : SDValue Hi = N0.getOperand(1);
7668 39 : EVT EltVT = Lo.getValueType();
7669 :
7670 : if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
7671 60 : for (unsigned I = 0; I != 2; ++I) {
7672 40 : SDValue Op = N0.getOperand(I);
7673 : if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7674 12 : NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7675 24 : CFP->getValueAPF());
7676 28 : } else if (Op.isUndef()) {
7677 : // Handled below based on what the other operand is.
7678 18 : NewElts[I] = Op;
7679 : } else {
7680 10 : NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7681 : }
7682 : }
7683 :
7684 :       // If one half is undef, and one is constant, prefer a splat vector rather
7685 : // than the normal qNaN. If it's a register, prefer 0.0 since that's
7686 : // cheaper to use and may be free with a packed operation.
7687 40 : if (NewElts[0].isUndef()) {
7688 : if (isa<ConstantFPSDNode>(NewElts[1]))
7689 4 : NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
7690 0 : NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
7691 : }
7692 :
7693 40 : if (NewElts[1].isUndef()) {
7694 10 : NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7695 6 : NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7696 : }
7697 :
7698 20 : return DAG.getBuildVector(VT, SL, NewElts);
7699 : }
7700 : }
7701 :
7702 906 : return isCanonicalized(DAG, N0) ? N0 : SDValue();
7703 : }
7704 :
7705 : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
7706 61 : switch (Opc) {
7707 : case ISD::FMAXNUM:
7708 : return AMDGPUISD::FMAX3;
7709 5 : case ISD::SMAX:
7710 : return AMDGPUISD::SMAX3;
7711 5 : case ISD::UMAX:
7712 : return AMDGPUISD::UMAX3;
7713 13 : case ISD::FMINNUM:
7714 : return AMDGPUISD::FMIN3;
7715 15 : case ISD::SMIN:
7716 : return AMDGPUISD::SMIN3;
7717 8 : case ISD::UMIN:
7718 : return AMDGPUISD::UMIN3;
7719 0 : default:
7720 0 : llvm_unreachable("Not a min/max opcode");
7721 : }
7722 : }
7723 :
7724 153 : SDValue SITargetLowering::performIntMed3ImmCombine(
7725 : SelectionDAG &DAG, const SDLoc &SL,
7726 : SDValue Op0, SDValue Op1, bool Signed) const {
7727 : ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
7728 : if (!K1)
7729 90 : return SDValue();
7730 :
7731 : ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
7732 : if (!K0)
7733 3 : return SDValue();
7734 :
7735 60 : if (Signed) {
7736 153 : if (K0->getAPIntValue().sge(K1->getAPIntValue()))
7737 3 : return SDValue();
7738 : } else {
7739 27 : if (K0->getAPIntValue().uge(K1->getAPIntValue()))
7740 3 : return SDValue();
7741 : }
7742 :
7743 54 : EVT VT = K0->getValueType(0);
7744 54 : unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
7745 12 : if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
7746 : return DAG.getNode(Med3Opc, SL, VT,
7747 50 : Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
7748 : }
7749 :
7750 : // If there isn't a 16-bit med3 operation, convert to 32-bit.
7751 : MVT NVT = MVT::i32;
7752 4 : unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7753 :
7754 4 : SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
7755 8 : SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
7756 4 : SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
7757 :
7758 4 : SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
7759 4 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
7760 : }
7761 :
7762 0 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
7763 : if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
7764 0 : return C;
7765 :
7766 : if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7767 0 : if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
7768 0 : return C;
7769 : }
7770 :
7771 : return nullptr;
7772 : }
7773 :
7774 431 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
7775 : const SDLoc &SL,
7776 : SDValue Op0,
7777 : SDValue Op1) const {
7778 431 : ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
7779 431 : if (!K1)
7780 70 : return SDValue();
7781 :
7782 361 : ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
7783 361 : if (!K0)
7784 7 : return SDValue();
7785 :
7786 : // Ordered >= (although NaN inputs should have folded away by now).
7787 1062 : APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
7788 354 : if (Cmp == APFloat::cmpGreaterThan)
7789 5 : return SDValue();
7790 :
7791 : // TODO: Check IEEE bit enabled?
7792 349 : EVT VT = Op0.getValueType();
7793 349 : if (Subtarget->enableDX10Clamp()) {
7794 : // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
7795 : // hardware fmed3 behavior converting to a min.
7796 : // FIXME: Should this be allowing -0.0?
7797 340 : if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
7798 257 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
7799 : }
7800 :
7801 : // med3 for f16 is only available on gfx9+, and not available for v2f16.
7802 6 : if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
7803 : // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
7804 : // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
7805 : // then give the other result, which is different from med3 with a NaN
7806 : // input.
7807 87 : SDValue Var = Op0.getOperand(0);
7808 87 : if (!DAG.isKnownNeverSNaN(Var))
7809 16 : return SDValue();
7810 :
7811 71 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7812 :
7813 68 : if ((!K0->hasOneUse() ||
7814 207 : TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
7815 56 : (!K1->hasOneUse() ||
7816 239 : TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
7817 : return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
7818 130 : Var, SDValue(K0, 0), SDValue(K1, 0));
7819 : }
7820 : }
7821 :
7822 11 : return SDValue();
7823 : }
7824 :
7825 3554 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
7826 : DAGCombinerInfo &DCI) const {
7827 3554 : SelectionDAG &DAG = DCI.DAG;
7828 :
7829 3554 : EVT VT = N->getValueType(0);
7830 3554 : unsigned Opc = N->getOpcode();
7831 3554 : SDValue Op0 = N->getOperand(0);
7832 3554 : SDValue Op1 = N->getOperand(1);
7833 :
7834 :   // Only do this if the inner op has one use since this will just increase
7835 : // register pressure for no benefit.
7836 :
7837 :
7838 3466 : if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
7839 3554 : !VT.isVector() && VT != MVT::f64 &&
7840 712 : ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
7841 : // max(max(a, b), c) -> max3(a, b, c)
7842 : // min(min(a, b), c) -> min3(a, b, c)
7843 2923 : if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
7844 : SDLoc DL(N);
7845 : return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7846 : DL,
7847 : N->getValueType(0),
7848 : Op0.getOperand(0),
7849 : Op0.getOperand(1),
7850 72 : Op1);
7851 : }
7852 :
7853 : // Try commuted.
7854 : // max(a, max(b, c)) -> max3(a, b, c)
7855 : // min(a, min(b, c)) -> min3(a, b, c)
7856 2858 : if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
7857 : SDLoc DL(N);
7858 : return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7859 : DL,
7860 : N->getValueType(0),
7861 : Op0,
7862 : Op1.getOperand(0),
7863 35 : Op1.getOperand(1));
7864 : }
7865 : }
7866 :
7867 : // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
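  // For example (illustrative): (smin (smax x, -3), 7) -> (smed3 x, -3, 7),
  // and similarly for the unsigned and floating-point forms below; the
  // special case (fminnum (fmaxnum x, 0.0), 1.0) becomes a clamp instead
  // when dx10_clamp is enabled.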
7868 3592 : if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
7869 93 : if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
7870 48 : return Med3;
7871 : }
7872 :
7873 3511 : if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
7874 60 : if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
7875 6 : return Med3;
7876 : }
7877 :
7878 : // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
7879 3439 : if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
7880 40 : (Opc == AMDGPUISD::FMIN_LEGACY &&
7881 : Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
7882 : (VT == MVT::f32 || VT == MVT::f64 ||
7883 73 : (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
7884 3904 : (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
7885 : Op0.hasOneUse()) {
7886 431 : if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
7887 322 : return Res;
7888 : }
7889 :
7890 3117 : return SDValue();
7891 : }
7892 :
7893 0 : static bool isClampZeroToOne(SDValue A, SDValue B) {
7894 : if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
7895 : if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
7896 : // FIXME: Should this be allowing -0.0?
7897 0 : return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
7898 0 : (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
7899 : }
7900 : }
7901 :
7902 : return false;
7903 : }
7904 :
7905 : // FIXME: Should only worry about snans for version with chain.
7906 177 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
7907 : DAGCombinerInfo &DCI) const {
7908 177 : EVT VT = N->getValueType(0);
7909 : // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
7910 : // NaNs. With a NaN input, the order of the operands may change the result.
7911 :
7912 177 : SelectionDAG &DAG = DCI.DAG;
7913 : SDLoc SL(N);
7914 :
7915 177 : SDValue Src0 = N->getOperand(0);
7916 177 : SDValue Src1 = N->getOperand(1);
7917 177 : SDValue Src2 = N->getOperand(2);
7918 :
7919 177 : if (isClampZeroToOne(Src0, Src1)) {
7920 : // const_a, const_b, x -> clamp is safe in all cases including signaling
7921 : // nans.
7922 : // FIXME: Should this be allowing -0.0?
7923 36 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
7924 : }
7925 :
7926 : // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
7927 : // handling no dx10-clamp?
7928 141 : if (Subtarget->enableDX10Clamp()) {
7929 :     // If NaNs are clamped to 0, we are free to reorder the inputs.
7930 :
7931 : if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
7932 : std::swap(Src0, Src1);
7933 :
7934 : if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
7935 : std::swap(Src1, Src2);
7936 :
7937 : if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
7938 : std::swap(Src0, Src1);
7939 :
7940 123 : if (isClampZeroToOne(Src1, Src2))
7941 12 : return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
7942 : }
7943 :
7944 129 : return SDValue();
7945 : }
7946 :
7947 113 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
7948 : DAGCombinerInfo &DCI) const {
7949 113 : SDValue Src0 = N->getOperand(0);
7950 113 : SDValue Src1 = N->getOperand(1);
7951 113 : if (Src0.isUndef() && Src1.isUndef())
7952 6 : return DCI.DAG.getUNDEF(N->getValueType(0));
7953 110 : return SDValue();
7954 : }
7955 :
7956 273398 : SDValue SITargetLowering::performExtractVectorEltCombine(
7957 : SDNode *N, DAGCombinerInfo &DCI) const {
7958 273398 : SDValue Vec = N->getOperand(0);
7959 273398 : SelectionDAG &DAG = DCI.DAG;
7960 :
7961 273398 : EVT VecVT = Vec.getValueType();
7962 273398 : EVT EltVT = VecVT.getVectorElementType();
7963 :
7964 273326 : if ((Vec.getOpcode() == ISD::FNEG ||
7965 273436 : Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
7966 : SDLoc SL(N);
7967 78 : EVT EltVT = N->getValueType(0);
7968 78 : SDValue Idx = N->getOperand(1);
7969 : SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7970 78 : Vec.getOperand(0), Idx);
7971 78 : return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
7972 : }
7973 :
7974 : // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
7975 : // =>
7976 : // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
7977 : // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
7978 : // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
7979 273320 : if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
7980 : SDLoc SL(N);
7981 1231 : EVT EltVT = N->getValueType(0);
7982 1231 : SDValue Idx = N->getOperand(1);
7983 : unsigned Opc = Vec.getOpcode();
7984 :
7985 1231 : switch(Opc) {
7986 1189 : default:
7987 1189 : return SDValue();
7988 : // TODO: Support other binary operations.
7989 : case ISD::FADD:
7990 : case ISD::FSUB:
7991 : case ISD::FMUL:
7992 : case ISD::ADD:
7993 : case ISD::UMIN:
7994 : case ISD::UMAX:
7995 : case ISD::SMIN:
7996 : case ISD::SMAX:
7997 : case ISD::FMAXNUM:
7998 : case ISD::FMINNUM: {
7999 : SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8000 42 : Vec.getOperand(0), Idx);
8001 : SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8002 42 : Vec.getOperand(1), Idx);
8003 :
8004 42 : DCI.AddToWorklist(Elt0.getNode());
8005 42 : DCI.AddToWorklist(Elt1.getNode());
8006 42 : return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8007 : }
8008 : }
8009 : }
8010 :
8011 272089 : if (!DCI.isBeforeLegalize())
8012 228613 : return SDValue();
8013 :
8014 43476 : unsigned VecSize = VecVT.getSizeInBits();
8015 43476 : unsigned EltSize = EltVT.getSizeInBits();
8016 :
8017 : // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8018 : // elements. This exposes more load reduction opportunities by replacing
8019 : // multiple small extract_vector_elements with a single 32-bit extract.
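  // For example (illustrative): (extract_vector_elt (v8i8 load), 5) is
  // rewritten below as a bitcast to v2i32, an extract of element 1, a shift
  // right by 8, and a truncate back to i8.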
8020 43476 : auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
8021 29633 : if (isa<MemSDNode>(Vec) &&
8022 2065 : EltSize <= 16 &&
8023 1425 : EltVT.isByteSized() &&
8024 326 : VecSize > 32 &&
8025 43802 : VecSize % 32 == 0 &&
8026 : Idx) {
8027 302 : EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8028 :
8029 302 : unsigned BitIndex = Idx->getZExtValue() * EltSize;
8030 302 : unsigned EltIdx = BitIndex / 32;
8031 302 : unsigned LeftoverBitIdx = BitIndex % 32;
8032 : SDLoc SL(N);
8033 :
8034 302 : SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8035 302 : DCI.AddToWorklist(Cast.getNode());
8036 :
8037 : SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8038 302 : DAG.getConstant(EltIdx, SL, MVT::i32));
8039 302 : DCI.AddToWorklist(Elt.getNode());
8040 : SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8041 302 : DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8042 302 : DCI.AddToWorklist(Srl.getNode());
8043 :
8044 302 : SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8045 302 : DCI.AddToWorklist(Trunc.getNode());
8046 302 : return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8047 : }
8048 :
8049 43174 : return SDValue();
8050 : }
8051 :
8052 3820 : static bool convertBuildVectorCastElt(SelectionDAG &DAG,
8053 : SDValue &Lo, SDValue &Hi) {
8054 3820 : if (Hi.getOpcode() == ISD::BITCAST &&
8055 3820 : Hi.getOperand(0).getValueType() == MVT::f16 &&
8056 4 : (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
8057 2 : Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
8058 2 : Hi = Hi.getOperand(0);
8059 2 : return true;
8060 : }
8061 :
8062 : return false;
8063 : }
8064 :
8065 139608 : SDValue SITargetLowering::performBuildVectorCombine(
8066 : SDNode *N, DAGCombinerInfo &DCI) const {
8067 : SDLoc SL(N);
8068 :
8069 : if (!isTypeLegal(MVT::v2i16))
8070 60105 : return SDValue();
8071 79503 : SelectionDAG &DAG = DCI.DAG;
8072 159006 : EVT VT = N->getValueType(0);
8073 :
8074 : if (VT == MVT::v2i16) {
8075 1911 : SDValue Lo = N->getOperand(0);
8076 1911 : SDValue Hi = N->getOperand(1);
8077 :
8078 : // v2i16 build_vector (const|undef), (bitcast f16:$x)
8079 : // -> bitcast (v2f16 build_vector const|undef, $x
8080 1911 : if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
8081 4 : SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
8082 2 : return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8083 : }
8084 :
8085 1909 : if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
8086 0 : SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
8087 0 : return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8088 : }
8089 : }
8090 :
8091 79501 : return SDValue();
8092 : }
8093 :
8094 200 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8095 : const SDNode *N0,
8096 : const SDNode *N1) const {
8097 400 : EVT VT = N0->getValueType(0);
8098 :
8099 : // Only do this if we are not trying to support denormals. v_mad_f32 does not
8100 : // support denormals ever.
8101 122 : if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8102 32 : (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8103 114 : return ISD::FMAD;
8104 :
8105 86 : const TargetOptions &Options = DAG.getTarget().Options;
8106 45 : if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8107 45 : (N0->getFlags().hasAllowContract() &&
8108 136 : N1->getFlags().hasAllowContract())) &&
8109 44 : isFMAFasterThanFMulAndFAdd(VT)) {
8110 28 : return ISD::FMA;
8111 : }
8112 :
8113 : return 0;
8114 : }
8115 :
8116 24 : static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8117 : EVT VT,
8118 : SDValue N0, SDValue N1, SDValue N2,
8119 : bool Signed) {
8120 24 : unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8121 24 : SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8122 24 : SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8123 24 : return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8124 : }
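// Illustrative use in the add combine below: for
//   (add i64 (mul i64 %a, %b), %c)
// where %a and %b are known to fit in 32 unsigned bits, the operands are
// truncated (or zero-extended) to i32 and the whole expression becomes
//   (mad_u64_u32 %a, %b, %c)
// with the signed variant used when the sign-bit analysis applies instead.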
8125 :
8126 120080 : SDValue SITargetLowering::performAddCombine(SDNode *N,
8127 : DAGCombinerInfo &DCI) const {
8128 120080 : SelectionDAG &DAG = DCI.DAG;
8129 240160 : EVT VT = N->getValueType(0);
8130 : SDLoc SL(N);
8131 120080 : SDValue LHS = N->getOperand(0);
8132 120080 : SDValue RHS = N->getOperand(1);
8133 :
8134 119537 : if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8135 4613 : && Subtarget->hasMad64_32() &&
8136 121438 : !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8137 : VT.getScalarSizeInBits() <= 64) {
8138 32 : if (LHS.getOpcode() != ISD::MUL)
8139 : std::swap(LHS, RHS);
8140 :
8141 32 : SDValue MulLHS = LHS.getOperand(0);
8142 32 : SDValue MulRHS = LHS.getOperand(1);
8143 32 : SDValue AddRHS = RHS;
8144 :
8145 : // TODO: Maybe restrict if SGPR inputs.
8146 50 : if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8147 18 : numBitsUnsigned(MulRHS, DAG) <= 32) {
8148 17 : MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8149 17 : MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8150 17 : AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8151 17 : return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8152 : }
8153 :
8154 15 : if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8155 7 : MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8156 7 : MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8157 7 : AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8158 7 : return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8159 : }
8160 :
8161 8 : return SDValue();
8162 : }
8163 :
8164 120048 : if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
8165 105557 : return SDValue();
8166 :
8167 : // add x, zext (setcc) => addcarry x, 0, setcc
8168 : // add x, sext (setcc) => subcarry x, 0, setcc
8169 : unsigned Opc = LHS.getOpcode();
8170 28982 : if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
8171 14491 : Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
8172 : std::swap(RHS, LHS);
8173 :
8174 : Opc = RHS.getOpcode();
8175 14491 : switch (Opc) {
8176 : default: break;
8177 153 : case ISD::ZERO_EXTEND:
8178 : case ISD::SIGN_EXTEND:
8179 : case ISD::ANY_EXTEND: {
8180 153 : auto Cond = RHS.getOperand(0);
8181 : if (!isBoolSGPR(Cond))
8182 : break;
8183 45 : SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8184 90 : SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8185 45 : Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8186 45 : return DAG.getNode(Opc, SL, VTList, Args);
8187 : }
8188 0 : case ISD::ADDCARRY: {
8189 : // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8190 : auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8191 0 : if (!C || C->getZExtValue() != 0) break;
8192 0 : SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8193 0 : return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8194 : }
8195 : }
8196 14446 : return SDValue();
8197 : }
8198 :
8199 3805 : SDValue SITargetLowering::performSubCombine(SDNode *N,
8200 : DAGCombinerInfo &DCI) const {
8201 3805 : SelectionDAG &DAG = DCI.DAG;
8202 3805 : EVT VT = N->getValueType(0);
8203 :
8204 : if (VT != MVT::i32)
8205 624 : return SDValue();
8206 :
8207 : SDLoc SL(N);
8208 3181 : SDValue LHS = N->getOperand(0);
8209 3181 : SDValue RHS = N->getOperand(1);
8210 :
8211 : unsigned Opc = LHS.getOpcode();
8212 3181 : if (Opc != ISD::SUBCARRY)
8213 : std::swap(RHS, LHS);
8214 :
8215 3181 : if (LHS.getOpcode() == ISD::SUBCARRY) {
8216 : // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8217 : auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8218 4 : if (!C || C->getZExtValue() != 0)
8219 0 : return SDValue();
8220 2 : SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8221 4 : return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8222 : }
8223 3179 : return SDValue();
8224 : }
8225 :
8226 680 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8227 : DAGCombinerInfo &DCI) const {
8228 :
8229 680 : if (N->getValueType(0) != MVT::i32)
8230 0 : return SDValue();
8231 :
8232 680 : auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8233 514 : if (!C || C->getZExtValue() != 0)
8234 423 : return SDValue();
8235 :
8236 257 : SelectionDAG &DAG = DCI.DAG;
8237 257 : SDValue LHS = N->getOperand(0);
8238 :
8239 : // addcarry (add x, y), 0, cc => addcarry x, y, cc
8240 : // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8241 : unsigned LHSOpc = LHS.getOpcode();
8242 257 : unsigned Opc = N->getOpcode();
8243 257 : if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8244 257 : (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8245 2 : SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8246 4 : return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
8247 : }
8248 255 : return SDValue();
8249 : }
8250 :
8251 8302 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
8252 : DAGCombinerInfo &DCI) const {
8253 8302 : if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8254 5946 : return SDValue();
8255 :
8256 2356 : SelectionDAG &DAG = DCI.DAG;
8257 4712 : EVT VT = N->getValueType(0);
8258 :
8259 : SDLoc SL(N);
8260 2356 : SDValue LHS = N->getOperand(0);
8261 2356 : SDValue RHS = N->getOperand(1);
8262 :
8263 : // These should really be instruction patterns, but writing patterns with
8264 :   // source modifiers is a pain.
8265 :
8266 : // fadd (fadd (a, a), b) -> mad 2.0, a, b
8267 2356 : if (LHS.getOpcode() == ISD::FADD) {
8268 303 : SDValue A = LHS.getOperand(0);
8269 303 : if (A == LHS.getOperand(1)) {
8270 105 : unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8271 105 : if (FusedOp != 0) {
8272 73 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8273 73 : return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
8274 : }
8275 : }
8276 : }
8277 :
8278 : // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8279 2283 : if (RHS.getOpcode() == ISD::FADD) {
8280 147 : SDValue A = RHS.getOperand(0);
8281 147 : if (A == RHS.getOperand(1)) {
8282 30 : unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8283 30 : if (FusedOp != 0) {
8284 20 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8285 20 : return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
8286 : }
8287 : }
8288 : }
8289 :
8290 2263 : return SDValue();
8291 : }
8292 :
8293 1634 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
8294 : DAGCombinerInfo &DCI) const {
8295 1634 : if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8296 1089 : return SDValue();
8297 :
8298 545 : SelectionDAG &DAG = DCI.DAG;
8299 : SDLoc SL(N);
8300 545 : EVT VT = N->getValueType(0);
8301 : assert(!VT.isVector());
8302 :
8303 : // Try to get the fneg to fold into the source modifier. This undoes generic
8304 : // DAG combines and folds them into the mad.
8305 : //
8306 : // Only do this if we are not trying to support denormals. v_mad_f32 does
8307 : // not support denormals ever.
8308 545 : SDValue LHS = N->getOperand(0);
8309 545 : SDValue RHS = N->getOperand(1);
8310 545 : if (LHS.getOpcode() == ISD::FADD) {
8311 : // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8312 46 : SDValue A = LHS.getOperand(0);
8313 46 : if (A == LHS.getOperand(1)) {
8314 24 : unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8315 24 : if (FusedOp != 0){
8316 17 : const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8317 17 : SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8318 :
8319 17 : return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
8320 : }
8321 : }
8322 : }
8323 :
8324 528 : if (RHS.getOpcode() == ISD::FADD) {
8325 : // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8326 :
8327 50 : SDValue A = RHS.getOperand(0);
8328 50 : if (A == RHS.getOperand(1)) {
8329 41 : unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8330 41 : if (FusedOp != 0){
8331 32 : const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
8332 32 : return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
8333 : }
8334 : }
8335 : }
8336 :
8337 496 : return SDValue();
8338 : }
8339 :
8340 1833 : SDValue SITargetLowering::performFMACombine(SDNode *N,
8341 : DAGCombinerInfo &DCI) const {
8342 1833 : SelectionDAG &DAG = DCI.DAG;
8343 1833 : EVT VT = N->getValueType(0);
8344 : SDLoc SL(N);
8345 :
8346 1833 : if (!Subtarget->hasDLInsts() || VT != MVT::f32)
8347 1608 : return SDValue();
8348 :
8349 :   // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8350 : // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
8351 225 : SDValue Op1 = N->getOperand(0);
8352 225 : SDValue Op2 = N->getOperand(1);
8353 225 : SDValue FMA = N->getOperand(2);
8354 :
8355 33 : if (FMA.getOpcode() != ISD::FMA ||
8356 258 : Op1.getOpcode() != ISD::FP_EXTEND ||
8357 : Op2.getOpcode() != ISD::FP_EXTEND)
8358 192 : return SDValue();
8359 :
8360 : // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
8361 : // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8362 :   // is sufficient to allow generating fdot2.
8363 33 : const TargetOptions &Options = DAG.getTarget().Options;
8364 33 : if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8365 0 : (N->getFlags().hasAllowContract() &&
8366 0 : FMA->getFlags().hasAllowContract())) {
8367 33 : Op1 = Op1.getOperand(0);
8368 33 : Op2 = Op2.getOperand(0);
8369 33 : if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8370 : Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8371 12 : return SDValue();
8372 :
8373 21 : SDValue Vec1 = Op1.getOperand(0);
8374 21 : SDValue Idx1 = Op1.getOperand(1);
8375 21 : SDValue Vec2 = Op2.getOperand(0);
8376 :
8377 21 : SDValue FMAOp1 = FMA.getOperand(0);
8378 21 : SDValue FMAOp2 = FMA.getOperand(1);
8379 21 : SDValue FMAAcc = FMA.getOperand(2);
8380 :
8381 21 : if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8382 : FMAOp2.getOpcode() != ISD::FP_EXTEND)
8383 0 : return SDValue();
8384 :
8385 21 : FMAOp1 = FMAOp1.getOperand(0);
8386 21 : FMAOp2 = FMAOp2.getOperand(0);
8387 21 : if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8388 : FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8389 0 : return SDValue();
8390 :
8391 21 : SDValue Vec3 = FMAOp1.getOperand(0);
8392 21 : SDValue Vec4 = FMAOp2.getOperand(0);
8393 21 : SDValue Idx2 = FMAOp1.getOperand(1);
8394 :
8395 : if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8396 : // Idx1 and Idx2 cannot be the same.
8397 : Idx1 == Idx2)
8398 12 : return SDValue();
8399 :
8400 : if (Vec1 == Vec2 || Vec3 == Vec4)
8401 0 : return SDValue();
8402 :
8403 : if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8404 3 : return SDValue();
8405 :
8406 : if ((Vec1 == Vec3 && Vec2 == Vec4) ||
8407 : (Vec1 == Vec4 && Vec2 == Vec3)) {
8408 : return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8409 12 : DAG.getTargetConstant(0, SL, MVT::i1));
8410 : }
8411 : }
8412 0 : return SDValue();
8413 : }
8414 :
8415 10765 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8416 : DAGCombinerInfo &DCI) const {
8417 10765 : SelectionDAG &DAG = DCI.DAG;
8418 : SDLoc SL(N);
8419 :
8420 10765 : SDValue LHS = N->getOperand(0);
8421 10765 : SDValue RHS = N->getOperand(1);
8422 : EVT VT = LHS.getValueType();
8423 10765 : ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8424 :
8425 : auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8426 : if (!CRHS) {
8427 : CRHS = dyn_cast<ConstantSDNode>(LHS);
8428 : if (CRHS) {
8429 : std::swap(LHS, RHS);
8430 0 : CC = getSetCCSwappedOperands(CC);
8431 : }
8432 : }
8433 :
8434 10765 : if (CRHS) {
8435 4618 : if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8436 16 : isBoolSGPR(LHS.getOperand(0))) {
8437 : // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8438 : // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8439 : // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8440 : // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
8441 3 : if ((CRHS->isAllOnesValue() &&
8442 3 : (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8443 0 : (CRHS->isNullValue() &&
8444 0 : (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8445 : return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8446 3 : DAG.getConstant(-1, SL, MVT::i1));
8447 0 : if ((CRHS->isAllOnesValue() &&
8448 0 : (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8449 0 : (CRHS->isNullValue() &&
8450 0 : (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8451 0 : return LHS.getOperand(0);
8452 : }
8453 :
8454 6005 : uint64_t CRHSVal = CRHS->getZExtValue();
8455 4389 : if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8456 : LHS.getOpcode() == ISD::SELECT &&
8457 : isa<ConstantSDNode>(LHS.getOperand(1)) &&
8458 162 : isa<ConstantSDNode>(LHS.getOperand(2)) &&
8459 6005 : LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8460 162 : isBoolSGPR(LHS.getOperand(0))) {
8461 : // Given CT != FT:
8462 : // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8463 : // setcc (select cc, CT, CF), CF, ne => cc
8464 : // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8465 : // setcc (select cc, CT, CF), CT, eq => cc
8466 : uint64_t CT = LHS.getConstantOperandVal(1);
8467 : uint64_t CF = LHS.getConstantOperandVal(2);
8468 :
8469 162 : if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8470 5 : (CT == CRHSVal && CC == ISD::SETNE))
8471 : return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8472 158 : DAG.getConstant(-1, SL, MVT::i1));
8473 4 : if ((CF == CRHSVal && CC == ISD::SETNE) ||
8474 3 : (CT == CRHSVal && CC == ISD::SETEQ))
8475 2 : return LHS.getOperand(0);
8476 : }
8477 : }
8478 :
8479 8818 : if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8480 : VT != MVT::f16))
8481 3612 : return SDValue();
8482 :
8483 : // Match isinf/isfinite pattern
8484 : // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
8485 : // (fcmp one (fabs x), inf) -> (fp_class x,
8486 : // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8487 6990 : if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
8488 : const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8489 : if (!CRHS)
8490 0 : return SDValue();
8491 :
8492 10 : const APFloat &APF = CRHS->getValueAPF();
8493 10 : if (APF.isInfinity() && !APF.isNegative()) {
8494 : const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8495 : SIInstrFlags::N_INFINITY;
8496 : const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8497 : SIInstrFlags::P_ZERO |
8498 : SIInstrFlags::N_NORMAL |
8499 : SIInstrFlags::P_NORMAL |
8500 : SIInstrFlags::N_SUBNORMAL |
8501 : SIInstrFlags::P_SUBNORMAL;
8502 10 : unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
8503 : return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8504 10 : DAG.getConstant(Mask, SL, MVT::i32));
8505 : }
8506 : }
8507 :
8508 6980 : return SDValue();
8509 : }
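
// Editorial sketch (standalone, not part of SIISelLowering.cpp): a model of the
// (setcc (sext from i1 cc), constant) fold above, assuming a 32-bit sign
// extension and showing only the eq/ne flavor; the sgt/ult/sle/uge forms are
// equivalent because the sign-extended value can only be 0 or -1. The asserts
// spell out why the combine may replace the setcc with `cc` or `xor cc, -1`.
#include <cassert>
#include <cstdint>

static bool setccOfSextBool(bool cc, int32_t rhs, bool isNE) {
  int32_t sext = cc ? -1 : 0;                  // sext from i1: true -> -1
  return isNE ? (sext != rhs) : (sext == rhs); // the original setcc
}

int main() {
  for (bool cc : {false, true}) {
    assert(setccOfSextBool(cc, -1, /*isNE=*/true) == !cc);  // -> xor cc, -1
    assert(setccOfSextBool(cc, -1, /*isNE=*/false) == cc);  // -> cc
    assert(setccOfSextBool(cc, 0, /*isNE=*/false) == !cc);  // -> xor cc, -1
    assert(setccOfSextBool(cc, 0, /*isNE=*/true) == cc);    // -> cc
  }
  return 0;
}
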
8510 :
8511 357 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8512 : DAGCombinerInfo &DCI) const {
8513 357 : SelectionDAG &DAG = DCI.DAG;
8514 : SDLoc SL(N);
8515 357 : unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8516 :
8517 357 : SDValue Src = N->getOperand(0);
8518 357 : SDValue Srl = N->getOperand(0);
8519 357 : if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8520 48 : Srl = Srl.getOperand(0);
8521 :
8522 : // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8523 357 : if (Srl.getOpcode() == ISD::SRL) {
8524 : // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8525 : // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8526 : // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8527 :
8528 : if (const ConstantSDNode *C =
8529 : dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8530 59 : Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8531 59 : EVT(MVT::i32));
8532 :
8533 59 : unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8534 59 : if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8535 59 : return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8536 59 : MVT::f32, Srl);
8537 : }
8538 : }
8539 : }
8540 :
8541 298 : APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8542 :
8543 298 : KnownBits Known;
8544 298 : TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8545 596 : !DCI.isBeforeLegalizeOps());
8546 298 : const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8547 596 : if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
8548 298 : TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
8549 93 : DCI.CommitTargetLoweringOpt(TLO);
8550 : }
8551 :
8552 298 : return SDValue();
8553 : }
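
// Editorial sketch (standalone, not backend code): the byte-index arithmetic
// used by the combine above when folding a constant right-shift into the
// CVT_F32_UBYTEn opcode. `ByteIdx` is the n in cvt_f32_ubyten and `ShiftAmt`
// is the srl amount; the fold is only legal when the selected byte is still
// byte-aligned and lies inside the 32-bit source.
#include <cassert>
#include <optional>

static std::optional<unsigned> foldSrlIntoUByteIndex(unsigned ByteIdx,
                                                     unsigned ShiftAmt) {
  unsigned SrcOffset = ShiftAmt + 8 * ByteIdx; // bit offset of the byte read
  if (SrcOffset < 32 && SrcOffset % 8 == 0)
    return SrcOffset / 8;                      // new cvt_f32_ubyte index
  return std::nullopt;
}

int main() {
  assert(foldSrlIntoUByteIndex(0, 16) == 2u); // cvt_f32_ubyte0 (srl x, 16) -> ubyte2
  assert(foldSrlIntoUByteIndex(1, 16) == 3u); // cvt_f32_ubyte1 (srl x, 16) -> ubyte3
  assert(foldSrlIntoUByteIndex(0, 8) == 1u);  // cvt_f32_ubyte0 (srl x, 8)  -> ubyte1
  assert(!foldSrlIntoUByteIndex(3, 16));      // would read past bit 31
  assert(!foldSrlIntoUByteIndex(0, 4));       // not byte aligned
  return 0;
}
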
8554 :
8555 334 : SDValue SITargetLowering::performClampCombine(SDNode *N,
8556 : DAGCombinerInfo &DCI) const {
8557 334 : ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8558 : if (!CSrc)
8559 309 : return SDValue();
8560 :
8561 25 : const APFloat &F = CSrc->getValueAPF();
8562 25 : APFloat Zero = APFloat::getZero(F.getSemantics());
8563 25 : APFloat::cmpResult Cmp0 = F.compare(Zero);
8564 25 : if (Cmp0 == APFloat::cmpLessThan ||
8565 12 : (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8566 18 : return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8567 : }
8568 :
8569 16 : APFloat One(F.getSemantics(), "1.0");
8570 16 : APFloat::cmpResult Cmp1 = F.compare(One);
8571 16 : if (Cmp1 == APFloat::cmpGreaterThan)
8572 6 : return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8573 :
8574 13 : return SDValue(CSrc, 0);
8575 : }
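
// Editorial sketch (standalone, not backend code) of the constant folding done
// by performClampCombine above: a constant below 0.0, or a NaN when DX10-style
// clamping is enabled, folds to 0.0; a constant above 1.0 folds to 1.0; any
// other constant (including NaN without DX10 clamp) is left unchanged.
#include <cassert>
#include <cmath>

static float foldClampConstant(float F, bool DX10Clamp) {
  if (F < 0.0f || (std::isnan(F) && DX10Clamp))
    return 0.0f;
  if (F > 1.0f)
    return 1.0f;
  return F; // already in [0, 1], or a NaN that must be preserved
}

int main() {
  assert(foldClampConstant(-2.5f, false) == 0.0f);
  assert(foldClampConstant(3.0f, false) == 1.0f);
  assert(foldClampConstant(0.25f, false) == 0.25f);
  assert(foldClampConstant(NAN, true) == 0.0f);
  assert(std::isnan(foldClampConstant(NAN, false)));
  return 0;
}
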
8576 :
8577 :
8578 1666237 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8579 : DAGCombinerInfo &DCI) const {
8580 3332474 : switch (N->getOpcode()) {
8581 378179 : default:
8582 378179 : return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8583 120080 : case ISD::ADD:
8584 120080 : return performAddCombine(N, DCI);
8585 3805 : case ISD::SUB:
8586 3805 : return performSubCombine(N, DCI);
8587 680 : case ISD::ADDCARRY:
8588 : case ISD::SUBCARRY:
8589 680 : return performAddCarrySubCarryCombine(N, DCI);
8590 8302 : case ISD::FADD:
8591 8302 : return performFAddCombine(N, DCI);
8592 1634 : case ISD::FSUB:
8593 1634 : return performFSubCombine(N, DCI);
8594 10765 : case ISD::SETCC:
8595 10765 : return performSetCCCombine(N, DCI);
8596 10329 : case ISD::FMAXNUM:
8597 : case ISD::FMINNUM:
8598 : case ISD::SMAX:
8599 : case ISD::SMIN:
8600 : case ISD::UMAX:
8601 : case ISD::UMIN:
8602 : case AMDGPUISD::FMIN_LEGACY:
8603 : case AMDGPUISD::FMAX_LEGACY: {
8604 10329 : if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
8605 3554 : getTargetMachine().getOptLevel() > CodeGenOpt::None)
8606 3554 : return performMinMaxCombine(N, DCI);
8607 : break;
8608 : }
8609 1833 : case ISD::FMA:
8610 1833 : return performFMACombine(N, DCI);
8611 : case ISD::LOAD: {
8612 310576 : if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
8613 48 : return Widened;
8614 : LLVM_FALLTHROUGH;
8615 : }
8616 : case ISD::STORE:
8617 : case ISD::ATOMIC_LOAD:
8618 : case ISD::ATOMIC_STORE:
8619 : case ISD::ATOMIC_CMP_SWAP:
8620 : case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8621 : case ISD::ATOMIC_SWAP:
8622 : case ISD::ATOMIC_LOAD_ADD:
8623 : case ISD::ATOMIC_LOAD_SUB:
8624 : case ISD::ATOMIC_LOAD_AND:
8625 : case ISD::ATOMIC_LOAD_OR:
8626 : case ISD::ATOMIC_LOAD_XOR:
8627 : case ISD::ATOMIC_LOAD_NAND:
8628 : case ISD::ATOMIC_LOAD_MIN:
8629 : case ISD::ATOMIC_LOAD_MAX:
8630 : case ISD::ATOMIC_LOAD_UMIN:
8631 : case ISD::ATOMIC_LOAD_UMAX:
8632 : case AMDGPUISD::ATOMIC_INC:
8633 : case AMDGPUISD::ATOMIC_DEC:
8634 : case AMDGPUISD::ATOMIC_LOAD_FADD:
8635 : case AMDGPUISD::ATOMIC_LOAD_FMIN:
8636 : case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
8637 641131 : if (DCI.isBeforeLegalize())
8638 : break;
8639 341305 : return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
8640 36547 : case ISD::AND:
8641 36547 : return performAndCombine(N, DCI);
8642 15730 : case ISD::OR:
8643 15730 : return performOrCombine(N, DCI);
8644 1928 : case ISD::XOR:
8645 1928 : return performXorCombine(N, DCI);
8646 21073 : case ISD::ZERO_EXTEND:
8647 21073 : return performZeroExtendCombine(N, DCI);
8648 102 : case AMDGPUISD::FP_CLASS:
8649 102 : return performClassCombine(N, DCI);
8650 1086 : case ISD::FCANONICALIZE:
8651 1086 : return performFCanonicalizeCombine(N, DCI);
8652 763 : case AMDGPUISD::RCP:
8653 763 : return performRcpCombine(N, DCI);
8654 641 : case AMDGPUISD::FRACT:
8655 : case AMDGPUISD::RSQ:
8656 : case AMDGPUISD::RCP_LEGACY:
8657 : case AMDGPUISD::RSQ_LEGACY:
8658 : case AMDGPUISD::RCP_IFLAG:
8659 : case AMDGPUISD::RSQ_CLAMP:
8660 : case AMDGPUISD::LDEXP: {
8661 641 : SDValue Src = N->getOperand(0);
8662 641 : if (Src.isUndef())
8663 10 : return Src;
8664 : break;
8665 : }
8666 1699 : case ISD::SINT_TO_FP:
8667 : case ISD::UINT_TO_FP:
8668 1699 : return performUCharToFloatCombine(N, DCI);
8669 357 : case AMDGPUISD::CVT_F32_UBYTE0:
8670 : case AMDGPUISD::CVT_F32_UBYTE1:
8671 : case AMDGPUISD::CVT_F32_UBYTE2:
8672 : case AMDGPUISD::CVT_F32_UBYTE3:
8673 357 : return performCvtF32UByteNCombine(N, DCI);
8674 177 : case AMDGPUISD::FMED3:
8675 177 : return performFMed3Combine(N, DCI);
8676 113 : case AMDGPUISD::CVT_PKRTZ_F16_F32:
8677 113 : return performCvtPkRTZCombine(N, DCI);
8678 334 : case AMDGPUISD::CLAMP:
8679 334 : return performClampCombine(N, DCI);
8680 2463 : case ISD::SCALAR_TO_VECTOR: {
8681 2463 : SelectionDAG &DAG = DCI.DAG;
8682 4926 : EVT VT = N->getValueType(0);
8683 :
8684 : // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
8685 : if (VT == MVT::v2i16 || VT == MVT::v2f16) {
8686 : SDLoc SL(N);
8687 247 : SDValue Src = N->getOperand(0);
8688 : EVT EltVT = Src.getValueType();
8689 : if (EltVT == MVT::f16)
8690 164 : Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
8691 :
8692 247 : SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
8693 247 : return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
8694 : }
8695 :
8696 2216 : break;
8697 : }
8698 266830 : case ISD::EXTRACT_VECTOR_ELT:
8699 266830 : return performExtractVectorEltCombine(N, DCI);
8700 139608 : case ISD::BUILD_VECTOR:
8701 139608 : return performBuildVectorCombine(N, DCI);
8702 : }
8703 309448 : return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8704 : }
8705 :
8706 : /// Helper function for adjustWritemask
8707 : static unsigned SubIdx2Lane(unsigned Idx) {
8708 : switch (Idx) {
8709 : default: return 0;
8710 : case AMDGPU::sub0: return 0;
8711 : case AMDGPU::sub1: return 1;
8712 : case AMDGPU::sub2: return 2;
8713 : case AMDGPU::sub3: return 3;
8714 : }
8715 : }
8716 :
8717 : /// Adjust the writemask of MIMG instructions
8718 738 : SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
8719 : SelectionDAG &DAG) const {
8720 738 : unsigned Opcode = Node->getMachineOpcode();
8721 :
8722 : // Subtract 1 because the vdata output is not a MachineSDNode operand.
8723 738 : int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
8724 1439 : if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
8725 : return Node; // not implemented for D16
8726 :
8727 714 : SDNode *Users[4] = { nullptr };
8728 : unsigned Lane = 0;
8729 714 : unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
8730 714 : unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
8731 : unsigned NewDmask = 0;
8732 714 : bool HasChain = Node->getNumValues() > 1;
8733 :
8734 714 : if (OldDmask == 0) {
8735 : // These are folded out, but on the chance it happens don't assert.
8736 : return Node;
8737 : }
8738 :
8739 : // Try to figure out the used register components
8740 714 : for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
8741 2693 : I != E; ++I) {
8742 :
8743 : // Don't look at users of the chain.
8744 2139 : if (I.getUse().getResNo() != 0)
8745 : continue;
8746 :
8747 : // Abort if we can't understand the usage
8748 2028 : if (!I->isMachineOpcode() ||
8749 : I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
8750 : return Node;
8751 :
8752 : // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
8753 : // Note that subregs are packed, i.e. Lane==0 is the first bit set
8754 : // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
8755 : // set, etc.
8756 1870 : Lane = SubIdx2Lane(I->getConstantOperandVal(1));
8757 :
8758 : // Set which texture component corresponds to the lane.
8759 : unsigned Comp;
8760 6292 : for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
8761 4422 : Comp = countTrailingZeros(Dmask);
8762 4422 : Dmask &= ~(1 << Comp);
8763 : }
8764 :
8765 : // Abort if we have more than one user per component
8766 1870 : if (Users[Lane])
8767 : return Node;
8768 :
8769 1868 : Users[Lane] = *I;
8770 1868 : NewDmask |= 1 << Comp;
8771 : }
8772 :
8773 : // Abort if there's no change
8774 554 : if (NewDmask == OldDmask)
8775 : return Node;
8776 :
8777 : unsigned BitsSet = countPopulation(NewDmask);
8778 :
8779 95 : int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
8780 : assert(NewOpcode != -1 &&
8781 : NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
8782 : "failed to find equivalent MIMG op");
8783 :
8784 : // Adjust the writemask in the node
8785 : SmallVector<SDValue, 12> Ops;
8786 95 : Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
8787 95 : Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
8788 285 : Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
8789 :
8790 190 : MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
8791 :
8792 : MVT ResultVT = BitsSet == 1 ?
8793 113 : SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
8794 : SDVTList NewVTList = HasChain ?
8795 95 : DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
8796 :
8797 :
8798 95 : MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
8799 : NewVTList, Ops);
8800 :
8801 95 : if (HasChain) {
8802 : // Update chain.
8803 184 : DAG.setNodeMemRefs(NewNode, Node->memoperands());
8804 184 : DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
8805 : }
8806 :
8807 95 : if (BitsSet == 1) {
8808 : assert(Node->hasNUsesOfValue(1, 0));
8809 57 : SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
8810 114 : SDLoc(Node), Users[Lane]->getValueType(0),
8811 : SDValue(NewNode, 0));
8812 57 : DAG.ReplaceAllUsesWith(Users[Lane], Copy);
8813 57 : return nullptr;
8814 : }
8815 :
8816 : // Update the users of the node with the new indices
8817 190 : for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
8818 152 : SDNode *User = Users[i];
8819 152 : if (!User)
8820 56 : continue;
8821 :
8822 96 : SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
8823 96 : DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
8824 :
8825 : switch (Idx) {
8826 : default: break;
8827 : case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
8828 : case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
8829 : case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
8830 : }
8831 : }
8832 :
8833 38 : DAG.RemoveDeadNode(Node);
8834 38 : return nullptr;
8835 : }
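
// Editorial sketch (standalone, not backend code) of the dmask recomputation
// in adjustWritemask above. Lane n corresponds to the (n+1)-th set bit of the
// old dmask, so the new dmask keeps exactly the components whose lanes are
// still extracted. Precondition: every lane index is < popcount(OldDmask).
#include <cassert>
#include <initializer_list>

static unsigned countTrailingZeros32(unsigned V) { // assumes V != 0
  unsigned N = 0;
  while ((V & 1u) == 0) { V >>= 1; ++N; }
  return N;
}

static unsigned recomputeDmask(unsigned OldDmask,
                               std::initializer_list<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask, Comp = 0;
    for (unsigned i = 0; i <= Lane; ++i) {   // walk to the Lane-th set bit
      Comp = countTrailingZeros32(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;                  // keep that texture component
  }
  return NewDmask;
}

int main() {
  assert(recomputeDmask(0xF, {0, 2}) == 0x5); // xyzw loaded, only x/z used
  assert(recomputeDmask(0xA, {1}) == 0x8);    // y/w loaded, lane 1 is w
  return 0;
}
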
8836 :
8837 : static bool isFrameIndexOp(SDValue Op) {
8838 440176 : if (Op.getOpcode() == ISD::AssertZext)
8839 70 : Op = Op.getOperand(0);
8840 :
8841 : return isa<FrameIndexSDNode>(Op);
8842 : }
8843 :
8844 : /// Legalize target independent instructions (e.g. INSERT_SUBREG)
8845 : /// with frame index operands.
8846 : /// LLVM assumes that inputs to these instructions are registers.
8847 75731 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
8848 : SelectionDAG &DAG) const {
8849 75731 : if (Node->getOpcode() == ISD::CopyToReg) {
8850 17183 : RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
8851 17183 : SDValue SrcVal = Node->getOperand(2);
8852 :
8853 : // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
8854 : // to try understanding copies to physical registers.
8855 187 : if (SrcVal.getValueType() == MVT::i1 &&
8856 187 : TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
8857 : SDLoc SL(Node);
8858 8 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
8859 : SDValue VReg = DAG.getRegister(
8860 8 : MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
8861 :
8862 : SDNode *Glued = Node->getGluedNode();
8863 : SDValue ToVReg
8864 8 : = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
8865 10 : SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
8866 : SDValue ToResultReg
8867 : = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
8868 16 : VReg, ToVReg.getValue(1));
8869 8 : DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
8870 8 : DAG.RemoveDeadNode(Node);
8871 : return ToResultReg.getNode();
8872 : }
8873 : }
8874 :
8875 : SmallVector<SDValue, 8> Ops;
8876 591622 : for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
8877 880352 : if (!isFrameIndexOp(Node->getOperand(i))) {
8878 440151 : Ops.push_back(Node->getOperand(i));
8879 440151 : continue;
8880 : }
8881 :
8882 : SDLoc DL(Node);
8883 50 : Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
8884 : Node->getOperand(i).getValueType(),
8885 25 : Node->getOperand(i)), 0));
8886 : }
8887 :
8888 75723 : return DAG.UpdateNodeOperands(Node, Ops);
8889 : }
8890 :
8891 : /// Fold the instructions after selecting them.
8892 : /// Returns null if users were already updated.
8893 428230 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
8894 : SelectionDAG &DAG) const {
8895 428230 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8896 428230 : unsigned Opcode = Node->getMachineOpcode();
8897 :
8898 856460 : if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
8899 : !TII->isGather4(Opcode)) {
8900 738 : return adjustWritemask(Node, DAG);
8901 : }
8902 :
8903 854984 : if (Opcode == AMDGPU::INSERT_SUBREG ||
8904 427492 : Opcode == AMDGPU::REG_SEQUENCE) {
8905 58548 : legalizeTargetIndependentNode(Node, DAG);
8906 58548 : return Node;
8907 : }
8908 :
8909 368944 : switch (Opcode) {
8910 283 : case AMDGPU::V_DIV_SCALE_F32:
8911 : case AMDGPU::V_DIV_SCALE_F64: {
8912 : // Satisfy the operand register constraint when one of the inputs is
8913 : // undefined. Ordinarily each undef value will have its own implicit_def of
8914 : // a vreg, so force these to use a single register.
8915 283 : SDValue Src0 = Node->getOperand(0);
8916 283 : SDValue Src1 = Node->getOperand(1);
8917 283 : SDValue Src2 = Node->getOperand(2);
8918 :
8919 275 : if ((Src0.isMachineOpcode() &&
8920 283 : Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
8921 : (Src0 == Src1 || Src0 == Src2))
8922 : break;
8923 :
8924 : MVT VT = Src0.getValueType().getSimpleVT();
8925 11 : const TargetRegisterClass *RC = getRegClassFor(VT);
8926 :
8927 11 : MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
8928 11 : SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
8929 :
8930 22 : SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
8931 22 : UndefReg, Src0, SDValue());
8932 :
8933 : // src0 must be the same register as src1 or src2, even if the value is
8934 : // undefined, so make sure we don't violate this constraint.
8935 11 : if (Src0.isMachineOpcode() &&
8936 : Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
8937 3 : if (Src1.isMachineOpcode() &&
8938 : Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
8939 : Src0 = Src1;
8940 3 : else if (Src2.isMachineOpcode() &&
8941 : Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
8942 : Src0 = Src2;
8943 : else {
8944 : assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
8945 1 : Src0 = UndefReg;
8946 1 : Src1 = UndefReg;
8947 : }
8948 : } else
8949 : break;
8950 :
8951 3 : SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
8952 3 : for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
8953 0 : Ops.push_back(Node->getOperand(I));
8954 :
8955 3 : Ops.push_back(ImpDef.getValue(1));
8956 6 : return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
8957 : }
8958 : default:
8959 : break;
8960 : }
8961 :
8962 368941 : return Node;
8963 : }
8964 :
8965 : /// Assign the register class depending on the number of
8966 : /// bits set in the writemask
8967 41573 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8968 : SDNode *Node) const {
8969 41573 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8970 :
8971 41573 : MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8972 :
8973 124719 : if (TII->isVOP3(MI.getOpcode())) {
8974 : // Make sure constant bus requirements are respected.
8975 38943 : TII->legalizeOperandsVOP3(MRI, MI);
8976 38943 : return;
8977 : }
8978 :
8979 : // Replace unused atomics with the no return version.
8980 2630 : int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
8981 2630 : if (NoRetAtomicOp != -1) {
8982 2077 : if (!Node->hasAnyUseOfValue(0)) {
8983 950 : MI.setDesc(TII->get(NoRetAtomicOp));
8984 950 : MI.RemoveOperand(0);
8985 950 : return;
8986 : }
8987 :
8988 : // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
8989 : // instruction, because the return type of these instructions is a vec2 of
8990 : // the memory type, so it can be tied to the input operand.
8991 : // This means these instructions always have a use, so we need to add a
8992 : // special case to check if the atomic has only one extract_subreg use,
8993 : // which itself has no uses.
8994 1127 : if ((Node->hasNUsesOfValue(1, 0) &&
8995 2236 : Node->use_begin()->isMachineOpcode() &&
8996 1159 : Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
8997 32 : !Node->use_begin()->hasAnyUseOfValue(0))) {
8998 0 : unsigned Def = MI.getOperand(0).getReg();
8999 :
9000 : // Change this into a noret atomic.
9001 0 : MI.setDesc(TII->get(NoRetAtomicOp));
9002 0 : MI.RemoveOperand(0);
9003 :
9004 : // If we only remove the def operand from the atomic instruction, the
9005 : // extract_subreg will be left with a use of a vreg without a def.
9006 : // So we need to insert an implicit_def to avoid machine verifier
9007 : // errors.
9008 0 : BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
9009 0 : TII->get(AMDGPU::IMPLICIT_DEF), Def);
9010 : }
9011 1127 : return;
9012 : }
9013 : }
9014 :
9015 41816 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9016 : uint64_t Val) {
9017 41816 : SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
9018 41816 : return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9019 : }
9020 :
9021 3633 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
9022 : const SDLoc &DL,
9023 : SDValue Ptr) const {
9024 3633 : const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9025 :
9026 : // Build the half of the subregister with the constants before building the
9027 : // full 128-bit register. If we are building multiple resource descriptors,
9028 : // this will allow CSEing of the 2-component register.
9029 : const SDValue Ops0[] = {
9030 3633 : DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9031 3633 : buildSMovImm32(DAG, DL, 0),
9032 3633 : DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9033 3633 : buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9034 3633 : DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9035 : };
9036 :
9037 3633 : SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9038 : MVT::v2i32, Ops0), 0);
9039 :
9040 : // Combine the constants and the pointer.
9041 : const SDValue Ops1[] = {
9042 3633 : DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9043 : Ptr,
9044 3633 : DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9045 : SubRegHi,
9046 3633 : DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9047 3633 : };
9048 :
9049 3633 : return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
9050 : }
9051 :
9052 : /// Return a resource descriptor with the 'Add TID' bit enabled
9053 : /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9054 : /// of the resource descriptor) to create an offset, which is added to
9055 : /// the resource pointer.
9056 17275 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9057 : SDValue Ptr, uint32_t RsrcDword1,
9058 : uint64_t RsrcDword2And3) const {
9059 17275 : SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9060 17275 : SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9061 17275 : if (RsrcDword1) {
9062 0 : PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
9063 : DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9064 : 0);
9065 : }
9066 :
9067 : SDValue DataLo = buildSMovImm32(DAG, DL,
9068 17275 : RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9069 17275 : SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9070 :
9071 : const SDValue Ops[] = {
9072 17275 : DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9073 : PtrLo,
9074 17275 : DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9075 : PtrHi,
9076 17275 : DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
9077 : DataLo,
9078 17275 : DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
9079 : DataHi,
9080 17275 : DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
9081 17275 : };
9082 :
9083 17275 : return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9084 : }
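
// Editorial sketch (standalone, not backend code) of how buildRSRC above packs
// the four 32-bit words of a buffer resource descriptor: sub0/sub1 come from
// the 64-bit pointer (sub1 optionally OR'ed with RsrcDword1), and sub2/sub3
// are the constant RsrcDword2And3 split into its low and high halves.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint32_t, 4> buildRsrcWords(uint64_t Ptr, uint32_t Dword1Bits,
                                              uint64_t Dword2And3) {
  return {static_cast<uint32_t>(Ptr & 0xFFFFFFFFu),        // sub0: pointer lo
          static_cast<uint32_t>(Ptr >> 32) | Dword1Bits,   // sub1: pointer hi
          static_cast<uint32_t>(Dword2And3 & 0xFFFFFFFFu), // sub2
          static_cast<uint32_t>(Dword2And3 >> 32)};        // sub3
}

int main() {
  auto W = buildRsrcWords(0x1122334455667788ull, 0, 0xAABBCCDD00112233ull);
  assert(W[0] == 0x55667788u && W[1] == 0x11223344u);
  assert(W[2] == 0x00112233u && W[3] == 0xAABBCCDDu);
  return 0;
}
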
9085 :
9086 : //===----------------------------------------------------------------------===//
9087 : // SI Inline Assembly Support
9088 : //===----------------------------------------------------------------------===//
9089 :
9090 : std::pair<unsigned, const TargetRegisterClass *>
9091 2343 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
9092 : StringRef Constraint,
9093 : MVT VT) const {
9094 : const TargetRegisterClass *RC = nullptr;
9095 2343 : if (Constraint.size() == 1) {
9096 1494 : switch (Constraint[0]) {
9097 0 : default:
9098 0 : return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9099 362 : case 's':
9100 : case 'r':
9101 362 : switch (VT.getSizeInBits()) {
9102 6 : default:
9103 6 : return std::make_pair(0U, nullptr);
9104 : case 32:
9105 : case 16:
9106 : RC = &AMDGPU::SReg_32_XM0RegClass;
9107 : break;
9108 77 : case 64:
9109 : RC = &AMDGPU::SGPR_64RegClass;
9110 77 : break;
9111 18 : case 128:
9112 : RC = &AMDGPU::SReg_128RegClass;
9113 18 : break;
9114 48 : case 256:
9115 : RC = &AMDGPU::SReg_256RegClass;
9116 48 : break;
9117 32 : case 512:
9118 : RC = &AMDGPU::SReg_512RegClass;
9119 32 : break;
9120 : }
9121 : break;
9122 385 : case 'v':
9123 385 : switch (VT.getSizeInBits()) {
9124 6 : default:
9125 6 : return std::make_pair(0U, nullptr);
9126 : case 32:
9127 : case 16:
9128 : RC = &AMDGPU::VGPR_32RegClass;
9129 : break;
9130 58 : case 64:
9131 : RC = &AMDGPU::VReg_64RegClass;
9132 58 : break;
9133 0 : case 96:
9134 : RC = &AMDGPU::VReg_96RegClass;
9135 0 : break;
9136 23 : case 128:
9137 : RC = &AMDGPU::VReg_128RegClass;
9138 23 : break;
9139 0 : case 256:
9140 : RC = &AMDGPU::VReg_256RegClass;
9141 0 : break;
9142 0 : case 512:
9143 : RC = &AMDGPU::VReg_512RegClass;
9144 0 : break;
9145 : }
9146 : break;
9147 : }
9148 : // We actually support i128, i16 and f16 as inline parameters
9149 : // even if they are not reported as legal
9150 38 : if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9151 26 : VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9152 : return std::make_pair(0U, RC);
9153 : }
9154 :
9155 1614 : if (Constraint.size() > 1) {
9156 3192 : if (Constraint[1] == 'v') {
9157 : RC = &AMDGPU::VGPR_32RegClass;
9158 724 : } else if (Constraint[1] == 's') {
9159 : RC = &AMDGPU::SGPR_32RegClass;
9160 : }
9161 :
9162 1596 : if (RC) {
9163 : uint32_t Idx;
9164 1430 : bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9165 1430 : if (!Failed && Idx < RC->getNumRegs())
9166 : return std::make_pair(RC->getRegister(Idx), RC);
9167 : }
9168 : }
9169 1614 : return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9170 : }
9171 :
9172 : SITargetLowering::ConstraintType
9173 7630 : SITargetLowering::getConstraintType(StringRef Constraint) const {
9174 7630 : if (Constraint.size() == 1) {
9175 5702 : switch (Constraint[0]) {
9176 : default: break;
9177 : case 's':
9178 : case 'v':
9179 : return C_RegisterClass;
9180 : }
9181 : }
9182 4979 : return TargetLowering::getConstraintType(Constraint);
9183 : }
9184 :
9185 : // Figure out which registers should be reserved for stack access. Only after
9186 : // the function is legalized do we know all of the non-spill stack objects or if
9187 : // calls are present.
9188 19746 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9189 19746 : MachineRegisterInfo &MRI = MF.getRegInfo();
9190 19746 : SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9191 19746 : const MachineFrameInfo &MFI = MF.getFrameInfo();
9192 19746 : const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9193 :
9194 19746 : if (Info->isEntryFunction()) {
9195 : // Callable (non-entry) functions use fixed registers for stack access, so
9195 : // only entry functions need their private memory registers reserved here.
9196 17974 : reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9197 : }
9198 :
9199 : // We have to assume the SP is needed in case there are calls in the function
9200 : // during lowering. Calls are only detected after the function is
9201 : // lowered. We're about to reserve registers, so don't bother reserving a
9202 : // stack pointer register if we aren't really going to need one.
9203 37720 : bool NeedSP = !Info->isEntryFunction() ||
9204 19746 : MFI.hasVarSizedObjects() ||
9205 17972 : MFI.hasCalls();
9206 :
9207 : if (NeedSP) {
9208 2193 : unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9209 : Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9210 :
9211 : assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9212 : assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9213 : Info->getStackPtrOffsetReg()));
9214 2193 : MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
9215 : }
9216 :
9217 19746 : MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9218 19746 : MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9219 19746 : MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9220 : Info->getScratchWaveOffsetReg());
9221 :
9222 19746 : Info->limitOccupancy(MF);
9223 :
9224 19746 : TargetLoweringBase::finalizeLowering(MF);
9225 19746 : }
9226 :
9227 447083 : void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9228 : KnownBits &Known,
9229 : const APInt &DemandedElts,
9230 : const SelectionDAG &DAG,
9231 : unsigned Depth) const {
9232 447083 : TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9233 : DAG, Depth);
9234 :
9235 447083 : if (getSubtarget()->enableHugePrivateBuffer())
9236 : return;
9237 :
9238 : // Technically it may be possible to have a dispatch with a single workitem
9239 : // that uses the full private memory size, but that's not really useful. We
9240 : // can't use vaddr in MUBUF instructions if we don't know the address
9241 : // calculation won't overflow, so assume the sign bit is never set.
9242 447075 : Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9243 : }
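
// Editorial sketch (standalone, not backend code) of what the code above
// reports: with a hypothetical AssumeFrameIndexHighZeroBits value of N, the
// top N bits of a frame-index value are claimed to be zero, which in
// particular pins the sign bit so MUBUF address math is never negative.
#include <cassert>
#include <cstdint>

static uint32_t frameIndexKnownZeroMask(unsigned HighZeroBits) {
  if (HighZeroBits == 0)
    return 0;
  return ~0u << (32 - HighZeroBits); // set the top HighZeroBits bits
}

int main() {
  assert(frameIndexKnownZeroMask(16) == 0xFFFF0000u);
  assert(frameIndexKnownZeroMask(1) == 0x80000000u); // at least the sign bit
  assert(frameIndexKnownZeroMask(0) == 0u);
  return 0;
}
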
9244 :
9245 2199040 : bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
9246 : FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
9247 : {
9248 4398080 : switch (N->getOpcode()) {
9249 144606 : case ISD::Register:
9250 : case ISD::CopyFromReg:
9251 : {
9252 : const RegisterSDNode *R = nullptr;
9253 144606 : if (N->getOpcode() == ISD::Register) {
9254 : R = dyn_cast<RegisterSDNode>(N);
9255 : }
9256 : else {
9257 62448 : R = dyn_cast<RegisterSDNode>(N->getOperand(1));
9258 : }
9259 144606 : if (R)
9260 : {
9261 144606 : const MachineFunction * MF = FLI->MF;
9262 144606 : const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
9263 144606 : const MachineRegisterInfo &MRI = MF->getRegInfo();
9264 144606 : const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
9265 144606 : unsigned Reg = R->getReg();
9266 144606 : if (TRI.isPhysicalRegister(Reg))
9267 16970 : return TRI.isVGPR(MRI, Reg);
9268 :
9269 127636 : if (MRI.isLiveIn(Reg)) {
9270 : // workitem.id.x workitem.id.y workitem.id.z
9271 : // Any VGPR formal argument is also considered divergent
9272 107680 : if (TRI.isVGPR(MRI, Reg))
9273 : return true;
9274 : // Formal arguments of non-entry functions
9275 : // are conservatively considered divergent
9276 160228 : else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
9277 : return true;
9278 : }
9279 96322 : return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
9280 0 : }
9281 : }
9282 : break;
9283 : case ISD::LOAD: {
9284 : const LoadSDNode *L = cast<LoadSDNode>(N);
9285 : unsigned AS = L->getAddressSpace();
9286 : // A flat load may access private memory.
9287 137717 : return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
9288 : }
9289 : case ISD::CALLSEQ_END:
9290 : return true;
9292 22034 : case ISD::INTRINSIC_WO_CHAIN:
9296 22034 : return AMDGPU::isIntrinsicSourceOfDivergence(
9297 66102 : cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
9298 2207 : case ISD::INTRINSIC_W_CHAIN:
9299 2207 : return AMDGPU::isIntrinsicSourceOfDivergence(
9300 6621 : cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
9301 : // In some cases intrinsics that are a source of divergence have been
9302 : // lowered to AMDGPUISD so we also need to check those too.
9303 : case AMDGPUISD::INTERP_MOV:
9304 : case AMDGPUISD::INTERP_P1:
9305 : case AMDGPUISD::INTERP_P2:
9306 : return true;
9307 : }
9308 : return false;
9309 : }
9310 :
9311 720 : bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
9312 720 : switch (VT.getScalarType().getSimpleVT().SimpleTy) {
9313 334 : case MVT::f32:
9314 334 : return Subtarget->hasFP32Denormals();
9315 69 : case MVT::f64:
9316 69 : return Subtarget->hasFP64Denormals();
9317 317 : case MVT::f16:
9318 317 : return Subtarget->hasFP16Denormals();
9319 : default:
9320 : return false;
9321 : }
9322 : }
|