LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIISelLowering.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-07-13 00:08:38
Lines:     3113 hit / 3295 total (94.5 %)
Functions:  160 hit /  161 total (99.4 %)

          Line data    Source code
       1             : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Custom DAG lowering for SI
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #ifdef _MSC_VER
      16             : // Provide M_PI.
      17             : #define _USE_MATH_DEFINES
      18             : #endif
      19             : 
      20             : #include "SIISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUIntrinsicInfo.h"
      23             : #include "AMDGPUSubtarget.h"
      24             : #include "AMDGPUTargetMachine.h"
      25             : #include "SIDefines.h"
      26             : #include "SIInstrInfo.h"
      27             : #include "SIMachineFunctionInfo.h"
      28             : #include "SIRegisterInfo.h"
      29             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      30             : #include "Utils/AMDGPUBaseInfo.h"
      31             : #include "llvm/ADT/APFloat.h"
      32             : #include "llvm/ADT/APInt.h"
      33             : #include "llvm/ADT/ArrayRef.h"
      34             : #include "llvm/ADT/BitVector.h"
      35             : #include "llvm/ADT/SmallVector.h"
      36             : #include "llvm/ADT/Statistic.h"
      37             : #include "llvm/ADT/StringRef.h"
      38             : #include "llvm/ADT/StringSwitch.h"
      39             : #include "llvm/ADT/Twine.h"
      40             : #include "llvm/CodeGen/Analysis.h"
      41             : #include "llvm/CodeGen/CallingConvLower.h"
      42             : #include "llvm/CodeGen/DAGCombine.h"
      43             : #include "llvm/CodeGen/ISDOpcodes.h"
      44             : #include "llvm/CodeGen/MachineBasicBlock.h"
      45             : #include "llvm/CodeGen/MachineFrameInfo.h"
      46             : #include "llvm/CodeGen/MachineFunction.h"
      47             : #include "llvm/CodeGen/MachineInstr.h"
      48             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      49             : #include "llvm/CodeGen/MachineMemOperand.h"
      50             : #include "llvm/CodeGen/MachineModuleInfo.h"
      51             : #include "llvm/CodeGen/MachineOperand.h"
      52             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      53             : #include "llvm/CodeGen/SelectionDAG.h"
      54             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      55             : #include "llvm/CodeGen/TargetCallingConv.h"
      56             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      57             : #include "llvm/CodeGen/ValueTypes.h"
      58             : #include "llvm/IR/Constants.h"
      59             : #include "llvm/IR/DataLayout.h"
      60             : #include "llvm/IR/DebugLoc.h"
      61             : #include "llvm/IR/DerivedTypes.h"
      62             : #include "llvm/IR/DiagnosticInfo.h"
      63             : #include "llvm/IR/Function.h"
      64             : #include "llvm/IR/GlobalValue.h"
      65             : #include "llvm/IR/InstrTypes.h"
      66             : #include "llvm/IR/Instruction.h"
      67             : #include "llvm/IR/Instructions.h"
      68             : #include "llvm/IR/IntrinsicInst.h"
      69             : #include "llvm/IR/Type.h"
      70             : #include "llvm/Support/Casting.h"
      71             : #include "llvm/Support/CodeGen.h"
      72             : #include "llvm/Support/CommandLine.h"
      73             : #include "llvm/Support/Compiler.h"
      74             : #include "llvm/Support/ErrorHandling.h"
      75             : #include "llvm/Support/KnownBits.h"
      76             : #include "llvm/Support/MachineValueType.h"
      77             : #include "llvm/Support/MathExtras.h"
      78             : #include "llvm/Target/TargetOptions.h"
      79             : #include <cassert>
      80             : #include <cmath>
      81             : #include <cstdint>
      82             : #include <iterator>
      83             : #include <tuple>
      84             : #include <utility>
      85             : #include <vector>
      86             : 
      87             : using namespace llvm;
      88             : 
      89             : #define DEBUG_TYPE "si-lower"
      90             : 
      91             : STATISTIC(NumTailCalls, "Number of tail calls");
      92             : 
      93       99743 : static cl::opt<bool> EnableVGPRIndexMode(
      94             :   "amdgpu-vgpr-index-mode",
      95       99743 :   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
      96      299229 :   cl::init(false));
      97             : 
      98       99743 : static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
      99             :   "amdgpu-frame-index-zero-bits",
     100       99743 :   cl::desc("High bits of frame index assumed to be zero"),
     101      199486 :   cl::init(5),
     102      299229 :   cl::ReallyHidden);
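// Usage sketch (illustrative assumption, not part of this file): both options
// above are ordinary llvm::cl flags, so they can be toggled from an llc
// invocation, for example:
//   llc -march=amdgcn -amdgpu-vgpr-index-mode -amdgpu-frame-index-zero-bits=5 in.ll
// cl::ReallyHidden only keeps the second flag out of -help output; it does not
// prevent setting it.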
     103             : 
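// Descriptive note (added for clarity): the helper below returns the first
// SGPR_32 register not yet allocated by the calling-convention state, and hits
// llvm_unreachable if every SGPR has already been allocated.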
     104             : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
     105          45 :   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
     106         373 :   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
     107         418 :     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
     108             :       return AMDGPU::SGPR0 + Reg;
     109             :     }
     110             :   }
     111           0 :   llvm_unreachable("Cannot allocate sgpr");
     112             : }
     113             : 
     114        2271 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
     115        2271 :                                    const SISubtarget &STI)
     116             :     : AMDGPUTargetLowering(TM, STI),
     117        2271 :       Subtarget(&STI) {
     118             :   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
     119             :   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
     120             : 
     121             :   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
     122             :   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
     123             : 
     124             :   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
     125             :   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
     126             :   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
     127             : 
     128             :   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
     129             :   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
     130             : 
     131             :   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
     132             :   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
     133             : 
     134             :   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
     135             :   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
     136             : 
     137             :   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
     138             :   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
     139             : 
     140        2271 :   if (Subtarget->has16BitInsts()) {
     141             :     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     142             :     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
     143             : 
      144             :     // Unless there are also VOP3P operations, no operations are really legal.
     145             :     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     146             :     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
     147             :     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     148             :     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
     149             :   }
     150             : 
     151        4542 :   computeRegisterProperties(Subtarget->getRegisterInfo());
     152             : 
     153             :   // We need to custom lower vector stores from local memory
     154             :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     155             :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
     156             :   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
     157             :   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
     158             :   setOperationAction(ISD::LOAD, MVT::i1, Custom);
     159             : 
     160             :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     161             :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
     162             :   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
     163             :   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
     164             :   setOperationAction(ISD::STORE, MVT::i1, Custom);
     165             : 
     166             :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     167             :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
     168             :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
     169             :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
     170             :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
     171             :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
     172             :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
     173             :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
     174             :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
     175             :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
     176             : 
     177             :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     178             :   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
     179             : 
     180             :   setOperationAction(ISD::SELECT, MVT::i1, Promote);
     181             :   setOperationAction(ISD::SELECT, MVT::i64, Custom);
     182             :   setOperationAction(ISD::SELECT, MVT::f64, Promote);
     183             :   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
     184             : 
     185             :   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     186             :   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
     187             :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     188             :   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
     189             :   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
     190             : 
     191             :   setOperationAction(ISD::SETCC, MVT::i1, Promote);
     192             :   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
     193             :   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
     194             :   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
     195             : 
     196             :   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
     197             :   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
     198             : 
     199             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
     200             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
     201             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
     202             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
     203             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
     204             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
     205             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
     206             : 
     207             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     208             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
     209             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
     210             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
     211             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
     212             : 
     213             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
     214             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
     215             :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
     216             : 
     217             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     218             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
     219             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
     220             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
     221             : 
     222             :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     223             :   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
     224             :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     225             :   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
     226             :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     227             :   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     228             : 
     229             :   setOperationAction(ISD::UADDO, MVT::i32, Legal);
     230             :   setOperationAction(ISD::USUBO, MVT::i32, Legal);
     231             : 
     232             :   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
     233             :   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
     234             : 
     235             : #if 0
     236             :   setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
     237             :   setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
     238             : #endif
     239             : 
      240             :   // We only support LOAD/STORE and vector manipulation ops for the wide and
      241             :   // 64-bit/packed-16-bit element vector types listed below.
     242       18168 :   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
     243       20439 :         MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
     244     9392856 :     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     245     4687344 :       switch (Op) {
     246             :       case ISD::LOAD:
     247             :       case ISD::STORE:
     248             :       case ISD::BUILD_VECTOR:
     249             :       case ISD::BITCAST:
     250             :       case ISD::EXTRACT_VECTOR_ELT:
     251             :       case ISD::INSERT_VECTOR_ELT:
     252             :       case ISD::INSERT_SUBVECTOR:
     253             :       case ISD::EXTRACT_SUBVECTOR:
     254             :       case ISD::SCALAR_TO_VECTOR:
     255             :         break;
     256       18168 :       case ISD::CONCAT_VECTORS:
     257             :         setOperationAction(Op, VT, Custom);
     258             :         break;
     259     4505664 :       default:
     260             :         setOperationAction(Op, VT, Expand);
     261             :         break;
     262             :       }
     263             :     }
     264             :   }
     265             : 
     266             :   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
     267             : 
     268             :   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
     269             :   // is expanded to avoid having two separate loops in case the index is a VGPR.
     270             : 
     271             :   // Most operations are naturally 32-bit vector operations. We only support
     272             :   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
     273       11355 :   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
     274             :     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
     275             :     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
     276             : 
     277             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     278             :     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
     279             : 
     280             :     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
     281             :     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
     282             : 
     283             :     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
     284             :     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
     285             :   }
     286             : 
     287             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
     288             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
     289             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
     290             :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
     291             : 
     292             :   setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
     293             :   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
     294             : 
     295             :   // Avoid stack access for these.
     296             :   // TODO: Generalize to more vector types.
     297             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
     298             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
     299             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
     300             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
     301             : 
     302             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     303             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     304             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
     305             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
     306             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
     307             : 
     308             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
     309             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
     310             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
     311             : 
     312             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
     313             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
     314             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
     315             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
     316             : 
     317             :   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
     318             :   // and output demarshalling
     319             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
     320             :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
     321             : 
      322             :   // We can't return success/failure, only the old value;
     323             :   // let LLVM add the comparison
     324             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
     325             :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
     326             : 
     327        2271 :   if (Subtarget->hasFlatAddressSpace()) {
     328             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
     329             :     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
     330             :   }
     331             : 
     332             :   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
     333             :   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
     334             : 
      335             :   // This is s_memtime on SI and s_memrealtime on VI.
     336             :   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
     337             :   setOperationAction(ISD::TRAP, MVT::Other, Custom);
     338             :   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
     339             : 
     340        2271 :   if (Subtarget->has16BitInsts()) {
     341             :     setOperationAction(ISD::FLOG, MVT::f16, Custom);
     342             :     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
     343             :   }
     344             : 
     345             :   // v_mad_f32 does not support denormals according to some sources.
     346        2271 :   if (!Subtarget->hasFP32Denormals())
     347             :     setOperationAction(ISD::FMAD, MVT::f32, Legal);
     348             : 
     349             :   if (!Subtarget->hasBFI()) {
     350             :     // fcopysign can be done in a single instruction with BFI.
     351             :     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
     352             :     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     353             :   }
     354             : 
     355             :   if (!Subtarget->hasBCNT(32))
     356             :     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
     357             : 
     358             :   if (!Subtarget->hasBCNT(64))
     359             :     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
     360             : 
     361             :   if (Subtarget->hasFFBH())
     362             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
     363             : 
     364             :   if (Subtarget->hasFFBL())
     365             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
     366             : 
     367             :   // We only really have 32-bit BFE instructions (and 16-bit on VI).
     368             :   //
     369             :   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
     370             :   // effort to match them now. We want this to be false for i64 cases when the
     371             :   // extraction isn't restricted to the upper or lower half. Ideally we would
     372             :   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
     373             :   // span the midpoint are probably relatively rare, so don't worry about them
     374             :   // for now.
     375             :   if (Subtarget->hasBFE())
     376             :     setHasExtractBitsInsn(true);
     377             : 
     378             :   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
     379             :   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     380             : 
     381        2271 :   if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
     382             :     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     383             :     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     384             :     setOperationAction(ISD::FRINT, MVT::f64, Legal);
     385             :   } else {
     386             :     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
     387             :     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
     388             :     setOperationAction(ISD::FRINT, MVT::f64, Custom);
     389             :     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
     390             :   }
     391             : 
     392             :   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     393             : 
     394             :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     395             :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     396             :   setOperationAction(ISD::FDIV, MVT::f32, Custom);
     397             :   setOperationAction(ISD::FDIV, MVT::f64, Custom);
     398             : 
     399        2271 :   if (Subtarget->has16BitInsts()) {
     400             :     setOperationAction(ISD::Constant, MVT::i16, Legal);
     401             : 
     402             :     setOperationAction(ISD::SMIN, MVT::i16, Legal);
     403             :     setOperationAction(ISD::SMAX, MVT::i16, Legal);
     404             : 
     405             :     setOperationAction(ISD::UMIN, MVT::i16, Legal);
     406             :     setOperationAction(ISD::UMAX, MVT::i16, Legal);
     407             : 
     408             :     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
     409             :     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
     410             : 
     411             :     setOperationAction(ISD::ROTR, MVT::i16, Promote);
     412             :     setOperationAction(ISD::ROTL, MVT::i16, Promote);
     413             : 
     414             :     setOperationAction(ISD::SDIV, MVT::i16, Promote);
     415             :     setOperationAction(ISD::UDIV, MVT::i16, Promote);
     416             :     setOperationAction(ISD::SREM, MVT::i16, Promote);
     417             :     setOperationAction(ISD::UREM, MVT::i16, Promote);
     418             : 
     419             :     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
     420             :     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
     421             : 
     422             :     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
     423             :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
     424             :     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
     425             :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
     426             :     setOperationAction(ISD::CTPOP, MVT::i16, Promote);
     427             : 
     428             :     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
     429             : 
     430             :     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
     431             : 
     432             :     setOperationAction(ISD::LOAD, MVT::i16, Custom);
     433             : 
     434             :     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     435             : 
     436             :     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
     437             :     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
     438             :     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     439             :     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
     440             : 
     441             :     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
     442             :     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     443             :     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
     444             :     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     445             : 
     446             :     // F16 - Constant Actions.
     447             :     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
     448             : 
     449             :     // F16 - Load/Store Actions.
     450             :     setOperationAction(ISD::LOAD, MVT::f16, Promote);
     451             :     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
     452             :     setOperationAction(ISD::STORE, MVT::f16, Promote);
     453             :     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
     454             : 
     455             :     // F16 - VOP1 Actions.
     456             :     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     457             :     setOperationAction(ISD::FCOS, MVT::f16, Promote);
     458             :     setOperationAction(ISD::FSIN, MVT::f16, Promote);
     459             :     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
     460             :     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     461             :     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
     462             :     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
     463             :     setOperationAction(ISD::FROUND, MVT::f16, Custom);
     464             : 
     465             :     // F16 - VOP2 Actions.
     466             :     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     467             :     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
     468             :     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     469             :     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     470             :     setOperationAction(ISD::FDIV, MVT::f16, Custom);
     471             : 
     472             :     // F16 - VOP3 Actions.
     473             :     setOperationAction(ISD::FMA, MVT::f16, Legal);
     474        1106 :     if (!Subtarget->hasFP16Denormals())
     475             :       setOperationAction(ISD::FMAD, MVT::f16, Legal);
     476             : 
     477        9954 :     for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
     478     2287208 :       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     479     1141392 :         switch (Op) {
     480             :         case ISD::LOAD:
     481             :         case ISD::STORE:
     482             :         case ISD::BUILD_VECTOR:
     483             :         case ISD::BITCAST:
     484             :         case ISD::EXTRACT_VECTOR_ELT:
     485             :         case ISD::INSERT_VECTOR_ELT:
     486             :         case ISD::INSERT_SUBVECTOR:
     487             :         case ISD::EXTRACT_SUBVECTOR:
     488             :         case ISD::SCALAR_TO_VECTOR:
     489             :           break;
     490        4424 :         case ISD::CONCAT_VECTORS:
     491             :           setOperationAction(Op, VT, Custom);
     492             :           break;
     493     1097152 :         default:
     494             :           setOperationAction(Op, VT, Expand);
     495             :           break;
     496             :         }
     497             :       }
     498             :     }
     499             : 
     500             :     // XXX - Do these do anything? Vector constants turn into build_vector.
     501             :     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
     502             :     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
     503             : 
     504             :     setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
     505             :     setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
     506             : 
     507             :     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
     508             :     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
     509             :     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
     510             :     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
     511             : 
     512             :     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
     513             :     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
     514             :     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
     515             :     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
     516             : 
     517             :     setOperationAction(ISD::AND, MVT::v2i16, Promote);
     518             :     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
     519             :     setOperationAction(ISD::OR, MVT::v2i16, Promote);
     520             :     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     521             :     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     522             :     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
     523             : 
     524             :     setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
     525             :     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
     526             :     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
     527             :     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
     528             : 
     529             :     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
     530             :     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
     531             :     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
     532             :     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
     533             : 
     534             :     setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
     535             :     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
     536             :     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
     537             :     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
     538             : 
     539             :     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
     540             :     setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
     541             :     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
     542             : 
     543        1106 :     if (!Subtarget->hasVOP3PInsts()) {
     544             :       setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
     545             :       setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
     546             :     }
     547             : 
     548             :     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
     549             :     // This isn't really legal, but this avoids the legalizer unrolling it (and
     550             :     // allows matching fneg (fabs x) patterns)
     551             :     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
     552             :   }
     553             : 
     554        2271 :   if (Subtarget->hasVOP3PInsts()) {
     555             :     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     556             :     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     557             :     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
     558             :     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
     559             :     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
     560             :     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
     561             :     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
     562             :     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
     563             :     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
     564             :     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
     565             : 
     566             :     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     567             :     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     568             :     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
     569             :     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
     570             :     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
     571             :     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
     572             : 
     573             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     574             :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     575             : 
     576             :     setOperationAction(ISD::SHL, MVT::v4i16, Custom);
     577             :     setOperationAction(ISD::SRA, MVT::v4i16, Custom);
     578             :     setOperationAction(ISD::SRL, MVT::v4i16, Custom);
     579             :     setOperationAction(ISD::ADD, MVT::v4i16, Custom);
     580             :     setOperationAction(ISD::SUB, MVT::v4i16, Custom);
     581             :     setOperationAction(ISD::MUL, MVT::v4i16, Custom);
     582             : 
     583             :     setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
     584             :     setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
     585             :     setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
     586             :     setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
     587             : 
     588             :     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     589             :     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
     590             :     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     591             :     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
     592             : 
     593             :     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
     594             :     setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
     595             :   }
     596             : 
     597             :   setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
     598             :   setOperationAction(ISD::FABS, MVT::v4f16, Custom);
     599             : 
     600        2271 :   if (Subtarget->has16BitInsts()) {
     601             :     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
     602             :     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
     603             :     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
     604             :     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
     605             :   } else {
     606             :     // Legalization hack.
     607             :     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     608             :     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
     609             : 
     610             :     setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
     611             :     setOperationAction(ISD::FABS, MVT::v2f16, Custom);
     612             :   }
     613             : 
     614       24981 :   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
     615             :     setOperationAction(ISD::SELECT, VT, Custom);
     616             :   }
     617             : 
     618             :   setTargetDAGCombine(ISD::ADD);
     619             :   setTargetDAGCombine(ISD::ADDCARRY);
     620             :   setTargetDAGCombine(ISD::SUB);
     621             :   setTargetDAGCombine(ISD::SUBCARRY);
     622             :   setTargetDAGCombine(ISD::FADD);
     623             :   setTargetDAGCombine(ISD::FSUB);
     624             :   setTargetDAGCombine(ISD::FMINNUM);
     625             :   setTargetDAGCombine(ISD::FMAXNUM);
     626             :   setTargetDAGCombine(ISD::SMIN);
     627             :   setTargetDAGCombine(ISD::SMAX);
     628             :   setTargetDAGCombine(ISD::UMIN);
     629             :   setTargetDAGCombine(ISD::UMAX);
     630             :   setTargetDAGCombine(ISD::SETCC);
     631             :   setTargetDAGCombine(ISD::AND);
     632             :   setTargetDAGCombine(ISD::OR);
     633             :   setTargetDAGCombine(ISD::XOR);
     634             :   setTargetDAGCombine(ISD::SINT_TO_FP);
     635             :   setTargetDAGCombine(ISD::UINT_TO_FP);
     636             :   setTargetDAGCombine(ISD::FCANONICALIZE);
     637             :   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
     638             :   setTargetDAGCombine(ISD::ZERO_EXTEND);
     639             :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     640             :   setTargetDAGCombine(ISD::BUILD_VECTOR);
     641             : 
     642             :   // All memory operations. Some folding on the pointer operand is done to help
      643             : // match the constant offsets in the addressing modes.
     644             :   setTargetDAGCombine(ISD::LOAD);
     645             :   setTargetDAGCombine(ISD::STORE);
     646             :   setTargetDAGCombine(ISD::ATOMIC_LOAD);
     647             :   setTargetDAGCombine(ISD::ATOMIC_STORE);
     648             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
     649             :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
     650             :   setTargetDAGCombine(ISD::ATOMIC_SWAP);
     651             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
     652             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
     653             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
     654             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
     655             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
     656             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
     657             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
     658             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
     659             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
     660             :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
     661             : 
     662             :   setSchedulingPreference(Sched::RegPressure);
     663             : 
     664             :   // SI at least has hardware support for floating point exceptions, but no way
     665             :   // of using or handling them is implemented. They are also optional in OpenCL
     666             :   // (Section 7.3)
     667        2271 :   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
     668        2271 : }
     669             : 
     670      987056 : const SISubtarget *SITargetLowering::getSubtarget() const {
     671      987056 :   return Subtarget;
     672             : }
     673             : 
     674             : //===----------------------------------------------------------------------===//
     675             : // TargetLowering queries
     676             : //===----------------------------------------------------------------------===//
     677             : 
     678             : // v_mad_mix* support a conversion from f16 to f32.
     679             : //
      680             : // There is only one special case, when denormals are enabled, where this is
      681             : // OK to use, and we don't currently handle it.
     682          24 : bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
     683             :                                            EVT DestVT, EVT SrcVT) const {
     684          22 :   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
     685           2 :           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
     686          70 :          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
     687          59 :          SrcVT.getScalarType() == MVT::f16;
     688             : }
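// Illustrative consequence (a sketch derived from the checks above, not a
// statement from the source): with mad-mix/fma-mix available and f32 denormals
// disabled, a node such as (fmad (fp_extend f16:$a), f32:$b, f32:$c) can keep
// the extend folded into the multiply-add instead of extending $a separately.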
     689             : 
     690          30 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
     691             :   // SI has some legal vector types, but no legal vector operations. Say no
     692             :   // shuffles are legal in order to prefer scalarizing some vector operations.
     693          30 :   return false;
     694             : }
     695             : 
     696       23102 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     697             :                                           const CallInst &CI,
     698             :                                           MachineFunction &MF,
     699             :                                           unsigned IntrID) const {
     700       23102 :   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
     701       23102 :           AMDGPU::lookupRsrcIntrinsic(IntrID)) {
     702             :     AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
     703        1127 :                                                   (Intrinsic::ID)IntrID);
     704        1127 :     if (Attr.hasFnAttribute(Attribute::ReadNone))
     705             :       return false;
     706             : 
     707        1091 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     708             : 
     709        1091 :     if (RsrcIntr->IsImage) {
     710         624 :       Info.ptrVal = MFI->getImagePSV(
     711         624 :         *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     712         624 :         CI.getArgOperand(RsrcIntr->RsrcArg));
     713         624 :       Info.align = 0;
     714             :     } else {
     715         467 :       Info.ptrVal = MFI->getBufferPSV(
     716         467 :         *MF.getSubtarget<SISubtarget>().getInstrInfo(),
     717         467 :         CI.getArgOperand(RsrcIntr->RsrcArg));
     718             :     }
     719             : 
     720        1091 :     Info.flags = MachineMemOperand::MODereferenceable;
     721        1091 :     if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
     722         722 :       Info.opc = ISD::INTRINSIC_W_CHAIN;
     723         722 :       Info.memVT = MVT::getVT(CI.getType());
     724             :       Info.flags |= MachineMemOperand::MOLoad;
     725         369 :     } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
     726         281 :       Info.opc = ISD::INTRINSIC_VOID;
     727         562 :       Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
     728             :       Info.flags |= MachineMemOperand::MOStore;
     729             :     } else {
     730             :       // Atomic
     731          88 :       Info.opc = ISD::INTRINSIC_W_CHAIN;
     732          88 :       Info.memVT = MVT::getVT(CI.getType());
     733             :       Info.flags = MachineMemOperand::MOLoad |
     734             :                    MachineMemOperand::MOStore |
     735             :                    MachineMemOperand::MODereferenceable;
     736             : 
     737             :       // XXX - Should this be volatile without known ordering?
     738             :       Info.flags |= MachineMemOperand::MOVolatile;
     739             :     }
     740             :     return true;
     741             :   }
     742             : 
     743             :   switch (IntrID) {
     744         245 :   case Intrinsic::amdgcn_atomic_inc:
     745             :   case Intrinsic::amdgcn_atomic_dec:
     746             :   case Intrinsic::amdgcn_ds_fadd:
     747             :   case Intrinsic::amdgcn_ds_fmin:
     748             :   case Intrinsic::amdgcn_ds_fmax: {
     749         245 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     750         245 :     Info.memVT = MVT::getVT(CI.getType());
     751         245 :     Info.ptrVal = CI.getOperand(0);
     752         245 :     Info.align = 0;
     753         245 :     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     754             : 
     755             :     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     756         242 :     if (!Vol || !Vol->isZero())
     757             :       Info.flags |= MachineMemOperand::MOVolatile;
     758             : 
     759             :     return true;
     760             :   }
     761             : 
     762             :   default:
     763             :     return false;
     764             :   }
     765             : }
     766             : 
     767       29766 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
     768             :                                             SmallVectorImpl<Value*> &Ops,
     769             :                                             Type *&AccessTy) const {
     770             :   switch (II->getIntrinsicID()) {
     771         269 :   case Intrinsic::amdgcn_atomic_inc:
     772             :   case Intrinsic::amdgcn_atomic_dec:
     773             :   case Intrinsic::amdgcn_ds_fadd:
     774             :   case Intrinsic::amdgcn_ds_fmin:
     775             :   case Intrinsic::amdgcn_ds_fmax: {
     776         538 :     Value *Ptr = II->getArgOperand(0);
     777         269 :     AccessTy = II->getType();
     778         269 :     Ops.push_back(Ptr);
     779             :     return true;
     780             :   }
     781             :   default:
     782             :     return false;
     783             :   }
     784             : }
     785             : 
     786       50663 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
     787       50663 :   if (!Subtarget->hasFlatInstOffsets()) {
     788             :     // Flat instructions do not have offsets, and only have the register
     789             :     // address.
     790       49015 :     return AM.BaseOffs == 0 && AM.Scale == 0;
     791             :   }
     792             : 
     793             :   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
      794             : // the sign bit is ignored and the offset is treated as a 12-bit unsigned value.
     795             : 
     796             :   // Just r + i
     797        1648 :   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
     798             : }
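// Worked example (illustrative, restating the checks above): without flat
// instruction offsets, only {BaseOffs = 0, Scale = 0} is legal. With them
// (GFX9), {BaseOffs = 4095, Scale = 0} is accepted since 4095 fits in 12
// unsigned bits, while BaseOffs = 4096 or any non-zero Scale is rejected.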
     799             : 
     800      109384 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
     801      109384 :   if (Subtarget->hasFlatGlobalInsts())
     802       40454 :     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
     803             : 
     804       89157 :   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
     805             :       // Assume the we will use FLAT for all global memory accesses
     806             :       // on VI.
     807             :       // FIXME: This assumption is currently wrong.  On VI we still use
     808             :       // MUBUF instructions for the r + i addressing mode.  As currently
      809             :       // implemented, the MUBUF instructions only work on buffers < 4GB.
     810             :       // It may be possible to support > 4GB buffers with MUBUF instructions,
     811             :       // by setting the stride value in the resource descriptor which would
     812             :       // increase the size limit to (stride * 4GB).  However, this is risky,
     813             :       // because it has never been validated.
     814       41889 :     return isLegalFlatAddressingMode(AM);
     815             :   }
     816             : 
     817       47268 :   return isLegalMUBUFAddressingMode(AM);
     818             : }
     819             : 
     820       53454 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
     821             :   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
     822             :   // additionally can do r + r + i with addr64. 32-bit has more addressing
     823             :   // mode options. Depending on the resource constant, it can also do
     824             :   // (i64 r0) + (i32 r1) * (i14 i).
     825             :   //
     826             :   // Private arrays end up using a scratch buffer most of the time, so also
     827             :   // assume those use MUBUF instructions. Scratch loads / stores are currently
     828             :   // implemented as mubuf instructions with offen bit set, so slightly
     829             :   // different than the normal addr64.
     830       53454 :   if (!isUInt<12>(AM.BaseOffs))
     831             :     return false;
     832             : 
     833             :   // FIXME: Since we can split immediate into soffset and immediate offset,
     834             :   // would it make sense to allow any immediate?
     835             : 
     836       52933 :   switch (AM.Scale) {
     837             :   case 0: // r + i or just i, depending on HasBaseReg.
     838             :     return true;
     839             :   case 1:
     840             :     return true; // We have r + r or r + i.
     841         885 :   case 2:
     842         885 :     if (AM.HasBaseReg) {
     843             :       // Reject 2 * r + r.
     844             :       return false;
     845             :     }
     846             : 
      847             :     // Allow 2 * r as r + r.
      848             :     // Also allow 2 * r + i, which folds to r + r + i.
     849           0 :     return true;
     850       13643 :   default: // Don't allow n * r
     851       13643 :     return false;
     852             :   }
     853             : }
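// Worked example (illustrative, restating the switch above): {BaseOffs = 8,
// Scale = 1, HasBaseReg = true} is accepted (r + r + 8); {Scale = 2,
// HasBaseReg = false} is accepted (2 * r folds to r + r); {Scale = 2,
// HasBaseReg = true} and any Scale > 2 are rejected.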
     854             : 
     855      213712 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     856             :                                              const AddrMode &AM, Type *Ty,
     857             :                                              unsigned AS, Instruction *I) const {
     858             :   // No global is ever allowed as a base.
     859      213712 :   if (AM.BaseGV)
     860             :     return false;
     861             : 
     862      211047 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
     863       81956 :     return isLegalGlobalAddressingMode(AM);
     864             : 
     865      258182 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
     866      129091 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
     867             :     // If the offset isn't a multiple of 4, it probably isn't going to be
     868             :     // correctly aligned.
     869             :     // FIXME: Can we get the real alignment here?
     870       93339 :     if (AM.BaseOffs % 4 != 0)
     871          99 :       return isLegalMUBUFAddressingMode(AM);
     872             : 
     873             :     // There are no SMRD extloads, so if we have to do a small type access we
     874             :     // will use a MUBUF load.
     875             :     // FIXME?: We also need to do this if unaligned, but we don't know the
     876             :     // alignment here.
     877      186480 :     if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
     878       27428 :       return isLegalGlobalAddressingMode(AM);
     879             : 
     880       65812 :     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
     881             :       // SMRD instructions have an 8-bit, dword offset on SI.
     882       19530 :       if (!isUInt<8>(AM.BaseOffs / 4))
     883             :         return false;
     884       46282 :     } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
     885             :       // On CI+, this can also be a 32-bit literal constant offset. If it fits
     886             :       // in 8-bits, it can use a smaller encoding.
     887        9150 :       if (!isUInt<32>(AM.BaseOffs / 4))
     888             :         return false;
     889       37132 :     } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     890             :       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
     891       37132 :       if (!isUInt<20>(AM.BaseOffs))
     892             :         return false;
     893             :     } else
     894           0 :       llvm_unreachable("unhandled generation");
     895             : 
     896       65671 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     897             :       return true;
     898             : 
     899         379 :     if (AM.Scale == 1 && AM.HasBaseReg)
     900             :       return true;
     901             : 
     902         379 :     return false;
     903             : 
     904       35752 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     905        6087 :     return isLegalMUBUFAddressingMode(AM);
     906       38441 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
     907        8776 :              AS == AMDGPUASI.REGION_ADDRESS) {
     908             :     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     909             :     // field.
     910             :     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
     911             :     // an 8-bit dword offset but we don't know the alignment here.
     912       20891 :     if (!isUInt<16>(AM.BaseOffs))
     913             :       return false;
     914             : 
     915       19105 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     916             :       return true;
     917             : 
     918        3752 :     if (AM.Scale == 1 && AM.HasBaseReg)
     919             :       return true;
     920             : 
     921        2392 :     return false;
     922        8774 :   } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
     923             :              AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
     924             :     // For an unknown address space, this usually means that this is for some
     925             :     // reason being used for pure arithmetic, and not based on some addressing
     926             :     // computation. We don't have instructions that compute pointers with any
     927             :     // addressing modes, so treat them as having no offset like flat
     928             :     // instructions.
     929        8774 :     return isLegalFlatAddressingMode(AM);
     930             :   } else {
     931           0 :     llvm_unreachable("unhandled address space");
     932             :   }
     933             : }
     934             : 
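The generation-dependent branches above reduce to simple range tests on the scalar-load immediate offset: an 8-bit dword offset on SI, a 32-bit literal dword offset on CI, and a 20-bit byte offset on VI and newer. The following is a minimal standalone sketch of that arithmetic in plain C++ (the Gen enum and fitsScalarLoadOffset helper are illustrative names, not LLVM APIs):

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-ins for the subtarget generations referenced above.
    enum class Gen { SouthernIslands, SeaIslands, VolcanicIslands };

    // Mirrors the offset range tests above: SI uses an 8-bit dword offset,
    // CI accepts a 32-bit literal dword offset, and VI+ (SMEM) takes a 20-bit
    // byte offset. Negative offsets fail, as with LLVM's isUInt<N>.
    static bool fitsScalarLoadOffset(Gen G, int64_t BaseOffs) {
      switch (G) {
      case Gen::SouthernIslands:
        return static_cast<uint64_t>(BaseOffs / 4) < (1ull << 8);
      case Gen::SeaIslands:
        return static_cast<uint64_t>(BaseOffs / 4) < (1ull << 32);
      case Gen::VolcanicIslands:
        return static_cast<uint64_t>(BaseOffs) < (1ull << 20);
      }
      return false;
    }

    int main() {
      assert(fitsScalarLoadOffset(Gen::SouthernIslands, 1020));  // 255 dwords fits on SI
      assert(!fitsScalarLoadOffset(Gen::SouthernIslands, 1024)); // 256 dwords does not
      assert(fitsScalarLoadOffset(Gen::VolcanicIslands, (1 << 20) - 4));
      assert(!fitsScalarLoadOffset(Gen::VolcanicIslands, 1 << 20));
      return 0;
    }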
     935       15583 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
     936             :                                         const SelectionDAG &DAG) const {
     937       15583 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
     938        7251 :     return (MemVT.getSizeInBits() <= 4 * 32);
     939        8332 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     940        3392 :     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     941        3392 :     return (MemVT.getSizeInBits() <= MaxPrivateBits);
     942        4940 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     943        4940 :     return (MemVT.getSizeInBits() <= 2 * 32);
     944             :   }
     945             :   return true;
     946             : }
     947             : 
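The per-address-space size caps in canMergeStoresTo are small constants: 4 dwords for global/flat stores, the subtarget's maximum private element size for scratch, and 2 dwords for LDS. Below is a hedged standalone sketch of the same checks, with an illustrative AS enum in place of the real AMDGPU address-space numbering:

    #include <cassert>

    // Illustrative address-space tags; these are not the real AMDGPU address
    // space numbers, just labels for the sketch.
    enum class AS { Global, Flat, Private, Local, Other };

    // Mirrors the caps above: 4 dwords for global/flat, 8 * the subtarget's
    // max private element size (in bytes, here a parameter) for scratch, and
    // 2 dwords for LDS.
    static bool canMerge(AS AddrSpace, unsigned MemSizeInBits,
                         unsigned MaxPrivateElementSizeBytes) {
      switch (AddrSpace) {
      case AS::Global:
      case AS::Flat:
        return MemSizeInBits <= 4 * 32;
      case AS::Private:
        return MemSizeInBits <= 8 * MaxPrivateElementSizeBytes;
      case AS::Local:
        return MemSizeInBits <= 2 * 32;
      default:
        return true;
      }
    }

    int main() {
      assert(canMerge(AS::Global, 128, 4) && !canMerge(AS::Global, 256, 4));
      assert(canMerge(AS::Local, 64, 4) && !canMerge(AS::Local, 128, 4));
      assert(canMerge(AS::Private, 32, 4) && !canMerge(AS::Private, 64, 4));
      return 0;
    }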
     948      124793 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     949             :                                                       unsigned AddrSpace,
     950             :                                                       unsigned Align,
     951             :                                                       bool *IsFast) const {
     952      124793 :   if (IsFast)
     953       81721 :     *IsFast = false;
     954             : 
     955             :   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
     956             :   // which isn't a simple VT.
     957             :   // Until MVT is extended to handle this, simply check for the size and
     958             :   // rely on the condition below: allow accesses if the size is a multiple of 4.
     959      124793 :   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
     960             :                            VT.getStoreSize() > 16)) {
     961             :     return false;
     962             :   }
     963             : 
     964      241485 :   if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
     965      116692 :       AddrSpace == AMDGPUASI.REGION_ADDRESS) {
      966             :     // ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
      967             :     // aligned, 8-byte access in a single operation using ds_read2/write2_b32
     968             :     // with adjacent offsets.
     969        8101 :     bool AlignedBy4 = (Align % 4 == 0);
     970        8101 :     if (IsFast)
     971        5926 :       *IsFast = AlignedBy4;
     972             : 
     973             :     return AlignedBy4;
     974             :   }
     975             : 
     976             :   // FIXME: We have to be conservative here and assume that flat operations
     977             :   // will access scratch.  If we had access to the IR function, then we
     978             :   // could determine if any private memory was used in the function.
     979      233352 :   if (!Subtarget->hasUnalignedScratchAccess() &&
     980      232755 :       (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
     981      116095 :        AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
     982             :     return false;
     983             :   }
     984             : 
     985      116103 :   if (Subtarget->hasUnalignedBufferAccess()) {
      986             :     // If we have a uniform constant load, it still requires using a slow
     987             :     // buffer instruction if unaligned.
     988        6427 :     if (IsFast) {
     989       12624 :       *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
     990        9020 :                  AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
     991         604 :         (Align % 4 == 0) : true;
     992             :     }
     993             : 
     994             :     return true;
     995             :   }
     996             : 
      997             :   // Accesses smaller than a dword must be aligned.
     998      109676 :   if (VT.bitsLT(MVT::i32))
     999             :     return false;
    1000             : 
    1001             :   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
    1002             :   // byte-address are ignored, thus forcing Dword alignment.
    1003             :   // This applies to private, global, and constant memory.
    1004      107910 :   if (IsFast)
    1005       69487 :     *IsFast = true;
    1006             : 
    1007      107910 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
    1008             : }
    1009             : 
    1010         112 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
    1011             :                                           unsigned SrcAlign, bool IsMemset,
    1012             :                                           bool ZeroMemset,
    1013             :                                           bool MemcpyStrSrc,
    1014             :                                           MachineFunction &MF) const {
    1015             :   // FIXME: Should account for address space here.
    1016             : 
    1017             :   // The default fallback uses the private pointer size as a guess for a type to
    1018             :   // use. Make sure we switch these to 64-bit accesses.
    1019             : 
    1020         112 :   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    1021          86 :     return MVT::v4i32;
    1022             : 
    1023          26 :   if (Size >= 8 && DstAlign >= 4)
    1024           8 :     return MVT::v2i32;
    1025             : 
    1026             :   // Use the default.
    1027          18 :   return MVT::Other;
    1028             : }
    1029             : 
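getOptimalMemOpType above picks wider vector types for memcpy/memset expansion whenever the size and destination alignment allow it. A small standalone illustration of that selection follows (MemOpTy is an illustrative stand-in for the MVT results, not an LLVM type):

    #include <cassert>
    #include <cstdint>

    // Illustrative result tags; MVT::v4i32 etc. are LLVM types not reproduced here.
    enum class MemOpTy { V4I32, V2I32, Default };

    // Mirrors the choice above: prefer 128-bit (v4i32) accesses for copies of at
    // least 16 bytes with a 4-byte aligned destination, 64-bit (v2i32) accesses
    // for at least 8 bytes, and otherwise defer to the generic default.
    static MemOpTy optimalMemOpType(uint64_t Size, unsigned DstAlign) {
      if (Size >= 16 && DstAlign >= 4)
        return MemOpTy::V4I32;
      if (Size >= 8 && DstAlign >= 4)
        return MemOpTy::V2I32;
      return MemOpTy::Default;
    }

    int main() {
      assert(optimalMemOpType(64, 16) == MemOpTy::V4I32);
      assert(optimalMemOpType(8, 4) == MemOpTy::V2I32);
      assert(optimalMemOpType(8, 1) == MemOpTy::Default);
      return 0;
    }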
    1030             : static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
    1031         514 :   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
    1032             :          AS == AMDGPUASI.FLAT_ADDRESS ||
    1033         926 :          AS == AMDGPUASI.CONSTANT_ADDRESS ||
    1034         212 :          AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
    1035             : }
    1036             : 
    1037         243 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
    1038             :                                            unsigned DestAS) const {
    1039         243 :   return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
    1040         243 :          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
    1041             : }
    1042             : 
    1043        3992 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
    1044             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
    1045        3992 :   const Value *Ptr = MemNode->getMemOperand()->getValue();
    1046             :   const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
    1047        6608 :   return I && I->getMetadata("amdgpu.noclobber");
    1048             : }
    1049             : 
    1050          93 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
    1051             :                                             unsigned DestAS) const {
    1052             :   // Flat -> private/local is a simple truncate.
     1053             :   // Flat -> global is a no-op.
    1054          93 :   if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
    1055             :     return true;
    1056             : 
    1057          29 :   return isNoopAddrSpaceCast(SrcAS, DestAS);
    1058             : }
    1059             : 
    1060           0 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
    1061             :   const MemSDNode *MemNode = cast<MemSDNode>(N);
    1062             : 
    1063           0 :   return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
    1064             : }
    1065             : 
    1066             : TargetLoweringBase::LegalizeTypeAction
    1067      188611 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
    1068      343157 :   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    1069             :     return TypeSplitVector;
    1070             : 
    1071             :   return TargetLoweringBase::getPreferredVectorAction(VT);
    1072             : }
    1073             : 
    1074          32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
    1075             :                                                          Type *Ty) const {
    1076             :   // FIXME: Could be smarter if called for vector constants.
    1077          32 :   return true;
    1078             : }
    1079             : 
    1080      276536 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
    1081      276536 :   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
    1082       27591 :     switch (Op) {
    1083             :     case ISD::LOAD:
    1084             :     case ISD::STORE:
    1085             : 
    1086             :     // These operations are done with 32-bit instructions anyway.
    1087             :     case ISD::AND:
    1088             :     case ISD::OR:
    1089             :     case ISD::XOR:
    1090             :     case ISD::SELECT:
    1091             :       // TODO: Extensions?
    1092             :       return true;
    1093       24117 :     default:
    1094       24117 :       return false;
    1095             :     }
    1096             :   }
    1097             : 
    1098             :   // SimplifySetCC uses this function to determine whether or not it should
    1099             :   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
    1100         684 :   if (VT == MVT::i1 && Op == ISD::SETCC)
    1101             :     return false;
    1102             : 
    1103             :   return TargetLowering::isTypeDesirableForOp(Op, VT);
    1104             : }
    1105             : 
    1106       38911 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
    1107             :                                                    const SDLoc &SL,
    1108             :                                                    SDValue Chain,
    1109             :                                                    uint64_t Offset) const {
    1110       38911 :   const DataLayout &DL = DAG.getDataLayout();
    1111       38911 :   MachineFunction &MF = DAG.getMachineFunction();
    1112       38911 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1113             : 
    1114             :   const ArgDescriptor *InputPtrReg;
    1115             :   const TargetRegisterClass *RC;
    1116             : 
    1117             :   std::tie(InputPtrReg, RC)
    1118             :     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    1119             : 
    1120       38911 :   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    1121             :   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
    1122             :   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
    1123       38911 :     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
    1124             : 
    1125       38911 :   return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
    1126             : }
    1127             : 
    1128          42 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
    1129             :                                             const SDLoc &SL) const {
    1130          42 :   uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
    1131          42 :                                                FIRST_IMPLICIT);
    1132          42 :   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
    1133             : }
    1134             : 
    1135       38869 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
    1136             :                                          const SDLoc &SL, SDValue Val,
    1137             :                                          bool Signed,
    1138             :                                          const ISD::InputArg *Arg) const {
    1139      116465 :   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
    1140          78 :       VT.bitsLT(MemVT)) {
    1141           9 :     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    1142           9 :     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
    1143             :   }
    1144             : 
    1145       38869 :   if (MemVT.isFloatingPoint())
    1146        2369 :     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
    1147       36500 :   else if (Signed)
    1148          14 :     Val = DAG.getSExtOrTrunc(Val, SL, VT);
    1149             :   else
    1150       36486 :     Val = DAG.getZExtOrTrunc(Val, SL, VT);
    1151             : 
    1152       38869 :   return Val;
    1153             : }
    1154             : 
    1155       38869 : SDValue SITargetLowering::lowerKernargMemParameter(
    1156             :   SelectionDAG &DAG, EVT VT, EVT MemVT,
    1157             :   const SDLoc &SL, SDValue Chain,
    1158             :   uint64_t Offset, unsigned Align, bool Signed,
    1159             :   const ISD::InputArg *Arg) const {
    1160       38869 :   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    1161       38869 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    1162       38869 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    1163             : 
    1164             :   // Try to avoid using an extload by loading earlier than the argument address,
    1165             :   // and extracting the relevant bits. The load should hopefully be merged with
    1166             :   // the previous argument.
    1167       38869 :   if (Align < 4) {
    1168             :     assert(MemVT.getStoreSize() < 4);
    1169             :     int64_t AlignDownOffset = alignDown(Offset, 4);
    1170        1234 :     int64_t OffsetDiff = Offset - AlignDownOffset;
    1171             : 
    1172        1234 :     EVT IntVT = MemVT.changeTypeToInteger();
    1173             : 
    1174             :     // TODO: If we passed in the base kernel offset we could have a better
    1175             :     // alignment than 4, but we don't really need it.
    1176        1234 :     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    1177             :     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
    1178             :                                MachineMemOperand::MODereferenceable |
    1179        1234 :                                MachineMemOperand::MOInvariant);
    1180             : 
    1181        1234 :     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
    1182        1234 :     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
    1183             : 
    1184        1234 :     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
    1185        1234 :     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    1186        1234 :     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
    1187             : 
    1188             : 
    1189        2468 :     return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
    1190             :   }
    1191             : 
    1192       37635 :   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
    1193             :   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
    1194             :                              MachineMemOperand::MODereferenceable |
    1195       37635 :                              MachineMemOperand::MOInvariant);
    1196             : 
    1197       37635 :   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
    1198       75270 :   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
    1199             : }
    1200             : 
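For sub-dword kernel arguments with alignment below 4, lowerKernargMemParameter loads the containing dword at the aligned-down offset and recovers the argument with a shift and truncate. The sketch below reproduces that extraction on ordinary memory, assuming a little-endian byte order as on amdgcn (loadI16Arg is an illustrative helper, not part of LLVM):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Extracts a 16-bit value stored at a 2-byte aligned offset by loading the
    // containing 32-bit word at the aligned-down offset, shifting right by the
    // byte difference, and truncating -- the same SRL/TRUNCATE pattern as above.
    // Assumes a little-endian byte order.
    static uint16_t loadI16Arg(const uint8_t *KernArgSegment, uint64_t Offset) {
      uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
      uint64_t OffsetDiff = Offset - AlignDownOffset;

      uint32_t Dword;
      std::memcpy(&Dword, KernArgSegment + AlignDownOffset, sizeof(Dword));
      return static_cast<uint16_t>(Dword >> (OffsetDiff * 8));
    }

    int main() {
      // Two i16 kernel arguments packed into a single dword of the kernarg segment.
      const uint8_t KernArgs[4] = {0x34, 0x12, 0x78, 0x56};
      assert(loadI16Arg(KernArgs, 0) == 0x1234);
      assert(loadI16Arg(KernArgs, 2) == 0x5678);
      return 0;
    }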
    1201         216 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
    1202             :                                               const SDLoc &SL, SDValue Chain,
    1203             :                                               const ISD::InputArg &Arg) const {
    1204         216 :   MachineFunction &MF = DAG.getMachineFunction();
    1205         216 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1206             : 
    1207         216 :   if (Arg.Flags.isByVal()) {
    1208          67 :     unsigned Size = Arg.Flags.getByValSize();
    1209          67 :     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
    1210          67 :     return DAG.getFrameIndex(FrameIdx, MVT::i32);
    1211             :   }
    1212             : 
    1213         149 :   unsigned ArgOffset = VA.getLocMemOffset();
    1214         298 :   unsigned ArgSize = VA.getValVT().getStoreSize();
    1215             : 
    1216         149 :   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
    1217             : 
    1218             :   // Create load nodes to retrieve arguments from the stack.
    1219         149 :   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
    1220             :   SDValue ArgValue;
    1221             : 
     1222             :   // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
    1223             :   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
    1224             :   MVT MemVT = VA.getValVT();
    1225             : 
    1226         149 :   switch (VA.getLocInfo()) {
    1227             :   default:
    1228             :     break;
    1229           0 :   case CCValAssign::BCvt:
    1230             :     MemVT = VA.getLocVT();
    1231           0 :     break;
    1232           0 :   case CCValAssign::SExt:
    1233             :     ExtType = ISD::SEXTLOAD;
    1234           0 :     break;
    1235           0 :   case CCValAssign::ZExt:
    1236             :     ExtType = ISD::ZEXTLOAD;
    1237           0 :     break;
    1238           3 :   case CCValAssign::AExt:
    1239             :     ExtType = ISD::EXTLOAD;
    1240           3 :     break;
    1241             :   }
    1242             : 
    1243         149 :   ArgValue = DAG.getExtLoad(
    1244             :     ExtType, SL, VA.getLocVT(), Chain, FIN,
    1245             :     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
    1246         298 :     MemVT);
    1247         149 :   return ArgValue;
    1248             : }
    1249             : 
    1250       13055 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
    1251             :   const SIMachineFunctionInfo &MFI,
    1252             :   EVT VT,
    1253             :   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
    1254             :   const ArgDescriptor *Reg;
    1255             :   const TargetRegisterClass *RC;
    1256             : 
    1257             :   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
    1258       13055 :   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
    1259             : }
    1260             : 
    1261        1252 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
    1262             :                                    CallingConv::ID CallConv,
    1263             :                                    ArrayRef<ISD::InputArg> Ins,
    1264             :                                    BitVector &Skipped,
    1265             :                                    FunctionType *FType,
    1266             :                                    SIMachineFunctionInfo *Info) {
    1267        6668 :   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    1268        5416 :     const ISD::InputArg &Arg = Ins[I];
    1269             : 
    1270             :     // First check if it's a PS input addr.
    1271        8116 :     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
    1272        8673 :         !Arg.Flags.isByVal() && PSInputNum <= 15) {
    1273             : 
    1274        5292 :       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
    1275             :         // We can safely skip PS inputs.
    1276             :         Skipped.set(I);
    1277         960 :         ++PSInputNum;
    1278         960 :         continue;
    1279             :       }
    1280             : 
    1281             :       Info->markPSInputAllocated(PSInputNum);
    1282        2297 :       if (Arg.Used)
    1283             :         Info->markPSInputEnabled(PSInputNum);
    1284             : 
    1285        2297 :       ++PSInputNum;
    1286             :     }
    1287             : 
     1288             :     // Second, split vertices into their elements.
    1289        4456 :     if (Arg.VT.isVector()) {
    1290        1336 :       ISD::InputArg NewArg = Arg;
    1291             :       NewArg.Flags.setSplit();
    1292        1336 :       NewArg.VT = Arg.VT.getVectorElementType();
    1293             : 
    1294             :       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
    1295             :       // three or five element vertex only needs three or five registers,
    1296             :       // NOT four or eight.
    1297        1336 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1298             :       unsigned NumElements = ParamType->getVectorNumElements();
    1299             : 
    1300       15564 :       for (unsigned J = 0; J != NumElements; ++J) {
    1301        7114 :         Splits.push_back(NewArg);
    1302        7114 :         NewArg.PartOffset += NewArg.VT.getStoreSize();
    1303             :       }
    1304             :     } else {
    1305        3120 :       Splits.push_back(Arg);
    1306             :     }
    1307             :   }
    1308        1252 : }
    1309             : 
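processShaderInputArgs splits each vector shader input into one scalar entry per original element so that, for example, a three-element vertex consumes exactly three registers. Below is a simplified standalone sketch of that splitting, using an illustrative InArg struct in place of ISD::InputArg:

    #include <cassert>
    #include <vector>

    // A simplified stand-in for ISD::InputArg: only the fields the splitting
    // logic touches are modeled.
    struct InArg {
      unsigned PartOffset = 0;    // byte offset of this piece within the argument
      unsigned ElemStoreSize = 4; // store size of one element, e.g. 4 for f32
    };

    // Each element of a vector argument becomes its own scalar entry, with
    // PartOffset advanced by the element's store size -- so a three-element
    // "vertex" occupies exactly three registers, not four.
    static void splitVectorArg(const InArg &Arg, unsigned NumElements,
                               std::vector<InArg> &Splits) {
      InArg NewArg = Arg;
      for (unsigned J = 0; J != NumElements; ++J) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.ElemStoreSize;
      }
    }

    int main() {
      std::vector<InArg> Splits;
      splitVectorArg(InArg{}, /*NumElements=*/3, Splits); // a vec3 shader input
      assert(Splits.size() == 3);
      assert(Splits[0].PartOffset == 0 && Splits[2].PartOffset == 8);
      return 0;
    }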
    1310             : // Allocate special inputs passed in VGPRs.
    1311       16447 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
    1312             :                                            MachineFunction &MF,
    1313             :                                            const SIRegisterInfo &TRI,
    1314             :                                            SIMachineFunctionInfo &Info) {
    1315       16447 :   if (Info.hasWorkItemIDX()) {
    1316             :     unsigned Reg = AMDGPU::VGPR0;
    1317       15195 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1318             : 
    1319       15195 :     CCInfo.AllocateReg(Reg);
    1320             :     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
    1321             :   }
    1322             : 
    1323       16447 :   if (Info.hasWorkItemIDY()) {
    1324             :     unsigned Reg = AMDGPU::VGPR1;
    1325         137 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1326             : 
    1327         137 :     CCInfo.AllocateReg(Reg);
    1328             :     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    1329             :   }
    1330             : 
    1331       16447 :   if (Info.hasWorkItemIDZ()) {
    1332             :     unsigned Reg = AMDGPU::VGPR2;
    1333          76 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1334             : 
    1335          76 :     CCInfo.AllocateReg(Reg);
    1336             :     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    1337             :   }
    1338       16447 : }
    1339             : 
     1340             : // Try to allocate a VGPR at the end of the argument list, or if no argument
     1341             : // VGPRs are left, allocate a stack slot instead.
    1342          36 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
    1343             :   ArrayRef<MCPhysReg> ArgVGPRs
    1344          36 :     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
    1345             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
    1346          36 :   if (RegIdx == ArgVGPRs.size()) {
    1347             :     // Spill to stack required.
    1348           8 :     int64_t Offset = CCInfo.AllocateStack(4, 4);
    1349             : 
    1350             :     return ArgDescriptor::createStack(Offset);
    1351             :   }
    1352             : 
    1353          28 :   unsigned Reg = ArgVGPRs[RegIdx];
    1354          28 :   Reg = CCInfo.AllocateReg(Reg);
    1355             :   assert(Reg != AMDGPU::NoRegister);
    1356             : 
    1357          28 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1358          28 :   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1359             :   return ArgDescriptor::createRegister(Reg);
    1360             : }
    1361             : 
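allocateVGPR32Input follows a register-pool-then-stack pattern: take the next free argument VGPR, and once the 32-entry pool is exhausted, allocate a 4-byte, 4-aligned stack slot instead. A generic standalone sketch of that fallback (ArgLoc and ArgAllocator are illustrative names, not LLVM's ArgDescriptor/CCState):

    #include <cassert>
    #include <cstdint>
    #include <optional>

    // ArgLoc is an illustrative stand-in for LLVM's ArgDescriptor: either a
    // register index from the pool or the byte offset of a 4-byte stack slot.
    struct ArgLoc {
      std::optional<unsigned> Reg;
      int64_t StackOffset = -1;
    };

    // Hands out argument registers from a fixed-size pool; once the pool is
    // exhausted, each further input falls back to a 4-byte, 4-aligned stack
    // slot, matching the register-then-stack fallback above.
    class ArgAllocator {
      unsigned NextReg = 0;
      unsigned NumArgRegs;
      int64_t NextStackOffset = 0;
    public:
      explicit ArgAllocator(unsigned NumArgRegs) : NumArgRegs(NumArgRegs) {}
      ArgLoc allocate() {
        if (NextReg < NumArgRegs)
          return ArgLoc{NextReg++, -1};
        ArgLoc L;
        L.StackOffset = NextStackOffset;
        NextStackOffset += 4;
        return L;
      }
    };

    int main() {
      ArgAllocator A(/*NumArgRegs=*/2);
      assert(A.allocate().Reg && A.allocate().Reg); // two register arguments
      ArgLoc Spilled = A.allocate();                // pool exhausted
      assert(!Spilled.Reg && Spilled.StackOffset == 0);
      return 0;
    }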
    1362         119 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
    1363             :                                              const TargetRegisterClass *RC,
    1364             :                                              unsigned NumArgRegs) {
    1365         119 :   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
    1366             :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
    1367         119 :   if (RegIdx == ArgSGPRs.size())
    1368           0 :     report_fatal_error("ran out of SGPRs for arguments");
    1369             : 
    1370         119 :   unsigned Reg = ArgSGPRs[RegIdx];
    1371         119 :   Reg = CCInfo.AllocateReg(Reg);
    1372             :   assert(Reg != AMDGPU::NoRegister);
    1373             : 
    1374         119 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1375         119 :   MF.addLiveIn(Reg, RC);
    1376         119 :   return ArgDescriptor::createRegister(Reg);
    1377             : }
    1378             : 
    1379             : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
    1380          62 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
    1381             : }
    1382             : 
    1383             : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
    1384          57 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
    1385             : }
    1386             : 
    1387        1407 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
    1388             :                                       MachineFunction &MF,
    1389             :                                       const SIRegisterInfo &TRI,
    1390             :                                       SIMachineFunctionInfo &Info) {
    1391        1407 :   if (Info.hasWorkItemIDX())
    1392          18 :     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
    1393             : 
    1394        1407 :   if (Info.hasWorkItemIDY())
    1395          10 :     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
    1396             : 
    1397        1407 :   if (Info.hasWorkItemIDZ())
    1398           8 :     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
    1399        1407 : }
    1400             : 
    1401        1407 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
    1402             :                                       MachineFunction &MF,
    1403             :                                       const SIRegisterInfo &TRI,
    1404             :                                       SIMachineFunctionInfo &Info) {
    1405             :   auto &ArgInfo = Info.getArgInfo();
    1406             : 
    1407             :   // TODO: Unify handling with private memory pointers.
    1408             : 
    1409        1407 :   if (Info.hasDispatchPtr())
    1410          10 :     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
    1411             : 
    1412        1407 :   if (Info.hasQueuePtr())
    1413          11 :     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
    1414             : 
    1415        1407 :   if (Info.hasKernargSegmentPtr())
    1416          14 :     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
    1417             : 
    1418        1407 :   if (Info.hasDispatchID())
    1419          10 :     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
    1420             : 
    1421             :   // flat_scratch_init is not applicable for non-kernel functions.
    1422             : 
    1423        1407 :   if (Info.hasWorkGroupIDX())
    1424          22 :     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
    1425             : 
    1426        1407 :   if (Info.hasWorkGroupIDY())
    1427          20 :     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
    1428             : 
    1429        1407 :   if (Info.hasWorkGroupIDZ())
    1430          20 :     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
    1431             : 
    1432        1407 :   if (Info.hasImplicitArgPtr())
    1433          12 :     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
    1434        1407 : }
    1435             : 
    1436             : // Allocate special inputs passed in user SGPRs.
    1437       16447 : static void allocateHSAUserSGPRs(CCState &CCInfo,
    1438             :                                  MachineFunction &MF,
    1439             :                                  const SIRegisterInfo &TRI,
    1440             :                                  SIMachineFunctionInfo &Info) {
    1441       16447 :   if (Info.hasImplicitBufferPtr()) {
    1442           2 :     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    1443           2 :     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    1444           2 :     CCInfo.AllocateReg(ImplicitBufferPtrReg);
    1445             :   }
    1446             : 
    1447             :   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
    1448       16447 :   if (Info.hasPrivateSegmentBuffer()) {
    1449        2398 :     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    1450        2398 :     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    1451        2398 :     CCInfo.AllocateReg(PrivateSegmentBufferReg);
    1452             :   }
    1453             : 
    1454       16447 :   if (Info.hasDispatchPtr()) {
    1455          42 :     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    1456          42 :     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    1457          42 :     CCInfo.AllocateReg(DispatchPtrReg);
    1458             :   }
    1459             : 
    1460       16447 :   if (Info.hasQueuePtr()) {
    1461          57 :     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    1462          57 :     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    1463          57 :     CCInfo.AllocateReg(QueuePtrReg);
    1464             :   }
    1465             : 
    1466       16447 :   if (Info.hasKernargSegmentPtr()) {
    1467       14261 :     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
    1468       14261 :     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    1469       14261 :     CCInfo.AllocateReg(InputPtrReg);
    1470             :   }
    1471             : 
    1472       16447 :   if (Info.hasDispatchID()) {
    1473           5 :     unsigned DispatchIDReg = Info.addDispatchID(TRI);
    1474           5 :     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    1475           5 :     CCInfo.AllocateReg(DispatchIDReg);
    1476             :   }
    1477             : 
    1478       16447 :   if (Info.hasFlatScratchInit()) {
    1479         357 :     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    1480         357 :     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    1481         357 :     CCInfo.AllocateReg(FlatScratchInitReg);
    1482             :   }
    1483             : 
    1484             :   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
    1485             :   // these from the dispatch pointer.
    1486       16447 : }
    1487             : 
    1488             : // Allocate special input registers that are initialized per-wave.
    1489       16447 : static void allocateSystemSGPRs(CCState &CCInfo,
    1490             :                                 MachineFunction &MF,
    1491             :                                 SIMachineFunctionInfo &Info,
    1492             :                                 CallingConv::ID CallConv,
    1493             :                                 bool IsShader) {
    1494       16447 :   if (Info.hasWorkGroupIDX()) {
    1495       15195 :     unsigned Reg = Info.addWorkGroupIDX();
    1496       15195 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1497       15195 :     CCInfo.AllocateReg(Reg);
    1498             :   }
    1499             : 
    1500       16447 :   if (Info.hasWorkGroupIDY()) {
    1501          24 :     unsigned Reg = Info.addWorkGroupIDY();
    1502          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1503          24 :     CCInfo.AllocateReg(Reg);
    1504             :   }
    1505             : 
    1506       16447 :   if (Info.hasWorkGroupIDZ()) {
    1507          24 :     unsigned Reg = Info.addWorkGroupIDZ();
    1508          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1509          24 :     CCInfo.AllocateReg(Reg);
    1510             :   }
    1511             : 
    1512       16447 :   if (Info.hasWorkGroupInfo()) {
    1513           0 :     unsigned Reg = Info.addWorkGroupInfo();
    1514           0 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1515           0 :     CCInfo.AllocateReg(Reg);
    1516             :   }
    1517             : 
    1518       16447 :   if (Info.hasPrivateSegmentWaveByteOffset()) {
    1519             :     // Scratch wave offset passed in system SGPR.
    1520             :     unsigned PrivateSegmentWaveByteOffsetReg;
    1521             : 
    1522       15245 :     if (IsShader) {
    1523             :       PrivateSegmentWaveByteOffsetReg =
    1524             :         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
    1525             : 
    1526             :       // This is true if the scratch wave byte offset doesn't have a fixed
    1527             :       // location.
    1528          50 :       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
    1529             :         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
    1530             :         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    1531             :       }
    1532             :     } else
    1533       15195 :       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    1534             : 
    1535       15245 :     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    1536       15245 :     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
    1537             :   }
    1538       16447 : }
    1539             : 
    1540       16470 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
    1541             :                                      MachineFunction &MF,
    1542             :                                      const SIRegisterInfo &TRI,
    1543             :                                      SIMachineFunctionInfo &Info) {
     1544             :   // Now that we've figured out where the scratch register inputs are, see if
     1545             :   // we should reserve the arguments and use them directly.
    1546       16470 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1547             :   bool HasStackObjects = MFI.hasStackObjects();
    1548             : 
    1549             :   // Record that we know we have non-spill stack objects so we don't need to
    1550             :   // check all stack objects later.
    1551       16470 :   if (HasStackObjects)
    1552             :     Info.setHasNonSpillStackObjects(true);
    1553             : 
    1554             :   // Everything live out of a block is spilled with fast regalloc, so it's
    1555             :   // almost certain that spilling will be required.
    1556       16470 :   if (TM.getOptLevel() == CodeGenOpt::None)
    1557             :     HasStackObjects = true;
    1558             : 
    1559             :   // For now assume stack access is needed in any callee functions, so we need
    1560             :   // the scratch registers to pass in.
    1561       16287 :   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
    1562             : 
    1563       16470 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1564       16470 :   if (ST.isAmdCodeObjectV2(MF.getFunction())) {
    1565        2401 :     if (RequiresStackAccess) {
    1566             :       // If we have stack objects, we unquestionably need the private buffer
    1567             :       // resource. For the Code Object V2 ABI, this will be the first 4 user
    1568             :       // SGPR inputs. We can reserve those and use them directly.
    1569             : 
    1570             :       unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
    1571             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    1572             :       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    1573             : 
    1574         462 :       if (MFI.hasCalls()) {
    1575             :         // If we have calls, we need to keep the frame register in a register
    1576             :         // that won't be clobbered by a call, so ensure it is copied somewhere.
    1577             : 
    1578             :         // This is not a problem for the scratch wave offset, because the same
    1579             :         // registers are reserved in all functions.
    1580             : 
    1581             :         // FIXME: Nothing is really ensuring this is a call preserved register,
    1582             :         // it's just selected from the end so it happens to be.
    1583             :         unsigned ReservedOffsetReg
    1584         245 :           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1585             :         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1586             :       } else {
    1587             :         unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
    1588             :           AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1589             :         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    1590             :       }
    1591             :     } else {
    1592             :       unsigned ReservedBufferReg
    1593        1939 :         = TRI.reservedPrivateSegmentBufferReg(MF);
    1594             :       unsigned ReservedOffsetReg
    1595        1939 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1596             : 
    1597             :       // We tentatively reserve the last registers (skipping the last two
    1598             :       // which may contain VCC). After register allocation, we'll replace
    1599             :       // these with the ones immediately after those which were really
     1600             :       // allocated. In the prologue, copies will be inserted from the argument
    1601             :       // to these reserved registers.
    1602             :       Info.setScratchRSrcReg(ReservedBufferReg);
    1603             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1604             :     }
    1605             :   } else {
    1606       14069 :     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    1607             : 
    1608             :     // Without HSA, relocations are used for the scratch pointer and the
    1609             :     // buffer resource setup is always inserted in the prologue. Scratch wave
    1610             :     // offset is still in an input SGPR.
    1611             :     Info.setScratchRSrcReg(ReservedBufferReg);
    1612             : 
    1613       14069 :     if (HasStackObjects && !MFI.hasCalls()) {
    1614             :       unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
    1615             :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1616             :       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    1617             :     } else {
    1618             :       unsigned ReservedOffsetReg
    1619       13811 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1620             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1621             :     }
    1622             :   }
    1623       16470 : }
    1624             : 
    1625       17674 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
    1626       17674 :   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1627       17674 :   return !Info->isEntryFunction();
    1628             : }
    1629             : 
    1630        1407 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
    1631             : 
    1632        1407 : }
    1633             : 
    1634        1407 : void SITargetLowering::insertCopiesSplitCSR(
    1635             :   MachineBasicBlock *Entry,
    1636             :   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
    1637        1407 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1638             : 
    1639        1407 :   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    1640        1407 :   if (!IStart)
    1641        1407 :     return;
    1642             : 
    1643           0 :   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    1644           0 :   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
    1645           0 :   MachineBasicBlock::iterator MBBI = Entry->begin();
    1646           0 :   for (const MCPhysReg *I = IStart; *I; ++I) {
    1647             :     const TargetRegisterClass *RC = nullptr;
    1648           0 :     if (AMDGPU::SReg_64RegClass.contains(*I))
    1649             :       RC = &AMDGPU::SGPR_64RegClass;
    1650           0 :     else if (AMDGPU::SReg_32RegClass.contains(*I))
    1651             :       RC = &AMDGPU::SGPR_32RegClass;
    1652             :     else
    1653           0 :       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    1654             : 
    1655           0 :     unsigned NewVR = MRI->createVirtualRegister(RC);
    1656             :     // Create copy from CSR to a virtual register.
    1657           0 :     Entry->addLiveIn(*I);
    1658           0 :     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
    1659           0 :       .addReg(*I);
    1660             : 
    1661             :     // Insert the copy-back instructions right before the terminator.
    1662           0 :     for (auto *Exit : Exits)
    1663           0 :       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
    1664           0 :               TII->get(TargetOpcode::COPY), *I)
    1665           0 :         .addReg(NewVR);
    1666             :   }
    1667             : }
    1668             : 
    1669       17857 : SDValue SITargetLowering::LowerFormalArguments(
    1670             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1671             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1672             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1673       17857 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1674             : 
    1675       17857 :   MachineFunction &MF = DAG.getMachineFunction();
    1676       17857 :   const Function &Fn = MF.getFunction();
    1677             :   FunctionType *FType = MF.getFunction().getFunctionType();
    1678       17857 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1679       17857 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1680             : 
    1681       35714 :   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    1682             :     DiagnosticInfoUnsupported NoGraphicsHSA(
    1683           6 :         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    1684           3 :     DAG.getContext()->diagnose(NoGraphicsHSA);
    1685             :     return DAG.getEntryNode();
    1686             :   }
    1687             : 
     1688             :   // Create stack objects that are used for emitting the debugger prologue if
     1689             :   // the "amdgpu-debugger-emit-prologue" attribute was specified.
    1690       17854 :   if (ST.debuggerEmitPrologue())
    1691           4 :     createDebuggerPrologueStackObjects(MF);
    1692             : 
    1693             :   SmallVector<ISD::InputArg, 16> Splits;
    1694             :   SmallVector<CCValAssign, 16> ArgLocs;
    1695       17854 :   BitVector Skipped(Ins.size());
    1696             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1697       35708 :                  *DAG.getContext());
    1698             : 
    1699       17854 :   bool IsShader = AMDGPU::isShader(CallConv);
    1700             :   bool IsKernel = AMDGPU::isKernel(CallConv);
    1701       17854 :   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
    1702             : 
    1703       17854 :   if (!IsEntryFunc) {
    1704             :     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
    1705             :     // this when allocating argument fixed offsets.
    1706        1407 :     CCInfo.AllocateStack(4, 4);
    1707             :   }
    1708             : 
    1709       17854 :   if (IsShader) {
    1710        1252 :     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
    1711             : 
    1712             :     // At least one interpolation mode must be enabled or else the GPU will
    1713             :     // hang.
    1714             :     //
    1715             :     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    1716             :     // set PSInputAddr, the user wants to enable some bits after the compilation
    1717             :     // based on run-time states. Since we can't know what the final PSInputEna
     1718             :     // based on run-time states. Since we can't know what the final PSInputEna
     1719             :     // will look like, we shouldn't do anything here; the user should take
     1720             :     // responsibility for the correct programming.
    1721             :     // Otherwise, the following restrictions apply:
    1722             :     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    1723             :     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    1724             :     //   enabled too.
    1725        1252 :     if (CallConv == CallingConv::AMDGPU_PS) {
    1726        1827 :       if ((Info->getPSInputAddr() & 0x7F) == 0 ||
    1727         828 :            ((Info->getPSInputAddr() & 0xF) == 0 &&
    1728             :             Info->isPSInputAllocated(11))) {
    1729             :         CCInfo.AllocateReg(AMDGPU::VGPR0);
    1730             :         CCInfo.AllocateReg(AMDGPU::VGPR1);
    1731             :         Info->markPSInputAllocated(0);
    1732             :         Info->markPSInputEnabled(0);
    1733             :       }
    1734        2004 :       if (Subtarget->isAmdPalOS()) {
    1735             :         // For isAmdPalOS, the user does not enable some bits after compilation
    1736             :         // based on run-time states; the register values being generated here are
    1737             :         // the final ones set in hardware. Therefore we need to apply the
    1738             :         // workaround to PSInputAddr and PSInputEnable together.  (The case where
    1739             :         // a bit is set in PSInputAddr but not PSInputEnable is where the
    1740             :         // frontend set up an input arg for a particular interpolation mode, but
    1741             :         // nothing uses that input arg. Really we should have an earlier pass
    1742             :         // that removes such an arg.)
    1743          10 :         unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    1744          17 :         if ((PsInputBits & 0x7F) == 0 ||
    1745           7 :             ((PsInputBits & 0xF) == 0 &&
    1746             :              (PsInputBits >> 11 & 1)))
    1747           3 :           Info->markPSInputEnabled(
    1748             :               countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    1749             :       }
    1750             :     }
    1751             : 
    1752             :     assert(!Info->hasDispatchPtr() &&
    1753             :            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
    1754             :            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
    1755             :            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
    1756             :            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
    1757             :            !Info->hasWorkItemIDZ());
    1758       16602 :   } else if (IsKernel) {
    1759             :     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    1760             :   } else {
    1761        1407 :     Splits.append(Ins.begin(), Ins.end());
    1762             :   }
    1763             : 
    1764       17854 :   if (IsEntryFunc) {
    1765       16447 :     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    1766       16447 :     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
    1767             :   }
    1768             : 
    1769       17854 :   if (IsKernel) {
    1770       15195 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1771             :   } else {
    1772        2659 :     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    1773        2659 :     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
    1774             :   }
    1775             : 
    1776             :   SmallVector<SDValue, 16> Chains;
    1777             : 
    1778             :   // FIXME: This is the minimum kernel argument alignment. We should improve
    1779             :   // this to the maximum alignment of the arguments.
    1780             :   //
     1781             :   // FIXME: Alignment of explicit arguments is totally broken with a non-0
     1782             :   // explicit kern arg offset.
    1783             :   const unsigned KernelArgBaseAlign = 16;
    1784       17854 :   const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
    1785             : 
     1786       65589 :   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    1787       47735 :     const ISD::InputArg &Arg = Ins[i];
    1788       48695 :     if (Skipped[i]) {
    1789        1920 :       InVals.push_back(DAG.getUNDEF(Arg.VT));
    1790       42238 :       continue;
    1791             :     }
    1792             : 
    1793       46775 :     CCValAssign &VA = ArgLocs[ArgIdx++];
    1794             :     MVT VT = VA.getLocVT();
    1795             : 
    1796       89997 :     if (IsEntryFunc && VA.isMemLoc()) {
    1797       38766 :       VT = Ins[i].VT;
    1798             :       EVT MemVT = VA.getLocVT();
    1799             : 
    1800       38766 :       const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
    1801       38766 :       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
    1802             : 
     1803             :       // The first 36 bytes of the input buffer contain information about
     1804             :       // thread group and global sizes for Clover.
    1805             :       SDValue Arg = lowerKernargMemParameter(
    1806       77532 :         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
    1807       38766 :       Chains.push_back(Arg.getValue(1));
    1808             : 
    1809             :       auto *ParamTy =
    1810       38766 :         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    1811       53296 :       if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    1812       47376 :           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
     1813             :         // On SI, local pointers are just offsets into LDS, so they are always
     1814             :         // less than 16 bits. On CI and newer they could potentially be
     1815             :         // real pointers, so we can't guarantee their size.
    1816         640 :         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
    1817        1280 :                           DAG.getValueType(MVT::i16));
    1818             :       }
    1819             : 
    1820       38766 :       InVals.push_back(Arg);
    1821       38766 :       continue;
    1822       11562 :     } else if (!IsEntryFunc && VA.isMemLoc()) {
    1823         216 :       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
    1824         216 :       InVals.push_back(Val);
    1825         216 :       if (!Arg.Flags.isByVal())
    1826         149 :         Chains.push_back(Val.getValue(1));
    1827         216 :       continue;
    1828             :     }
    1829             : 
    1830             :     assert(VA.isRegLoc() && "Parameter must be in a register!");
    1831             : 
    1832        7793 :     unsigned Reg = VA.getLocReg();
    1833        7793 :     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    1834             :     EVT ValVT = VA.getValVT();
    1835             : 
    1836        7793 :     Reg = MF.addLiveIn(Reg, RC);
    1837        7793 :     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1838             : 
    1839        7805 :     if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
    1840             :       // The return object should be reasonably addressable.
    1841             : 
     1842             :       // FIXME: This helps when the return is a real sret. If it is an
     1843             :       // automatically inserted sret (i.e. CanLowerReturn returns false), an
    1844             :       // extra copy is inserted in SelectionDAGBuilder which obscures this.
    1845          12 :       unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
    1846          12 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1847          24 :         DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    1848             :     }
    1849             : 
    1850             :     // If this is an 8 or 16-bit value, it is really passed promoted
    1851             :     // to 32 bits. Insert an assert[sz]ext to capture this, then
    1852             :     // truncate to the right size.
    1853        7793 :     switch (VA.getLocInfo()) {
    1854             :     case CCValAssign::Full:
    1855             :       break;
    1856             :     case CCValAssign::BCvt:
    1857           0 :       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
    1858           0 :       break;
    1859             :     case CCValAssign::SExt:
    1860           8 :       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
    1861          16 :                         DAG.getValueType(ValVT));
    1862           8 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1863           8 :       break;
    1864             :     case CCValAssign::ZExt:
    1865          12 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1866          24 :                         DAG.getValueType(ValVT));
    1867          12 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1868          12 :       break;
    1869             :     case CCValAssign::AExt:
    1870           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1871           7 :       break;
    1872           0 :     default:
    1873           0 :       llvm_unreachable("Unknown loc info!");
    1874             :     }
    1875             : 
    1876       12249 :     if (IsShader && Arg.VT.isVector()) {
    1877             :       // Build a vector from the registers
    1878        1336 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1879             :       unsigned NumElements = ParamType->getVectorNumElements();
    1880             : 
    1881             :       SmallVector<SDValue, 4> Regs;
    1882        1336 :       Regs.push_back(Val);
    1883       12892 :       for (unsigned j = 1; j != NumElements; ++j) {
    1884       11556 :         Reg = ArgLocs[ArgIdx++].getLocReg();
    1885        5778 :         Reg = MF.addLiveIn(Reg, RC);
    1886             : 
    1887        5778 :         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1888        5778 :         Regs.push_back(Copy);
    1889             :       }
    1890             : 
    1891             :       // Fill up the missing vector elements
    1892        1336 :       NumElements = Arg.VT.getVectorNumElements() - NumElements;
    1893        1336 :       Regs.append(NumElements, DAG.getUNDEF(VT));
    1894             : 
    1895        2672 :       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
    1896             :       continue;
    1897             :     }
    1898             : 
    1899        6457 :     InVals.push_back(Val);
    1900             :   }
    1901             : 
    1902       17854 :   if (!IsEntryFunc) {
    1903             :     // Special inputs come after user arguments.
    1904        1407 :     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
    1905             :   }
    1906             : 
    1907             :   // Start adding system SGPRs.
    1908       17854 :   if (IsEntryFunc) {
    1909       16447 :     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
    1910             :   } else {
    1911        1407 :     CCInfo.AllocateReg(Info->getScratchRSrcReg());
    1912        1407 :     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    1913        1407 :     CCInfo.AllocateReg(Info->getFrameOffsetReg());
    1914        1407 :     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
    1915             :   }
    1916             : 
    1917             :   auto &ArgUsageInfo =
    1918       17854 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    1919       17854 :   ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
    1920             : 
    1921       17854 :   unsigned StackArgSize = CCInfo.getNextStackOffset();
    1922             :   Info->setBytesInStackArgArea(StackArgSize);
    1923             : 
    1924       17854 :   return Chains.empty() ? Chain :
    1925       32134 :     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    1926             : }
    1927             : 
    1928             : // TODO: If return values can't fit in registers, we should return as many as
    1929             : // possible in registers before passing on stack.
    1930       18351 : bool SITargetLowering::CanLowerReturn(
    1931             :   CallingConv::ID CallConv,
    1932             :   MachineFunction &MF, bool IsVarArg,
    1933             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1934             :   LLVMContext &Context) const {
    1935             :   // Replacing returns with sret/stack usage doesn't make sense for shaders.
    1936             :   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
    1937             :   // for shaders. Vector types should be explicitly handled by CC.
    1938       18351 :   if (AMDGPU::isEntryFunctionCC(CallConv))
    1939             :     return true;
    1940             : 
    1941             :   SmallVector<CCValAssign, 16> RVLocs;
    1942        3802 :   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    1943        1901 :   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
    1944             : }
    1945             : 
    1946             : SDValue
    1947       17790 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    1948             :                               bool isVarArg,
    1949             :                               const SmallVectorImpl<ISD::OutputArg> &Outs,
    1950             :                               const SmallVectorImpl<SDValue> &OutVals,
    1951             :                               const SDLoc &DL, SelectionDAG &DAG) const {
    1952       17790 :   MachineFunction &MF = DAG.getMachineFunction();
    1953       17790 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1954             : 
    1955             :   if (AMDGPU::isKernel(CallConv)) {
    1956             :     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
    1957       15169 :                                              OutVals, DL, DAG);
    1958             :   }
    1959             : 
    1960        2621 :   bool IsShader = AMDGPU::isShader(CallConv);
    1961             : 
    1962        2621 :   Info->setIfReturnsVoid(Outs.size() == 0);
    1963        2621 :   bool IsWaveEnd = Info->returnsVoid() && IsShader;
    1964             : 
    1965             :   SmallVector<ISD::OutputArg, 48> Splits;
    1966             :   SmallVector<SDValue, 48> SplitVals;
    1967             : 
    1968             :   // Split vectors into their elements.
    1969        4597 :   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    1970        1976 :     const ISD::OutputArg &Out = Outs[i];
    1971             : 
    1972        3140 :     if (IsShader && Out.VT.isVector()) {
    1973         490 :       MVT VT = Out.VT.getVectorElementType();
    1974         490 :       ISD::OutputArg NewOut = Out;
    1975             :       NewOut.Flags.setSplit();
    1976         490 :       NewOut.VT = VT;
    1977             : 
    1978             :       // We want the original number of vector elements here, e.g.
    1979             :       // three or five, not four or eight.
    1980         490 :       unsigned NumElements = Out.ArgVT.getVectorNumElements();
    1981             : 
    1982        4450 :       for (unsigned j = 0; j != NumElements; ++j) {
    1983             :         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
    1984        3960 :                                    DAG.getConstant(j, DL, MVT::i32));
    1985        1980 :         SplitVals.push_back(Elem);
    1986        1980 :         Splits.push_back(NewOut);
    1987        1980 :         NewOut.PartOffset += NewOut.VT.getStoreSize();
    1988             :       }
    1989             :     } else {
    1990        1486 :       SplitVals.push_back(OutVals[i]);
    1991        1486 :       Splits.push_back(Out);
    1992             :     }
    1993             :   }
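                     :   // (Illustrative) a <3 x float> shader return value is split here into
                     :   // three f32 outputs, each of which is assigned its own return register
                     :   // below.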
    1994             : 
    1995             :   // CCValAssign - represent the assignment of the return value to a location.
    1996             :   SmallVector<CCValAssign, 48> RVLocs;
    1997             : 
    1998             :   // CCState - Info about the registers and stack slots.
    1999             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
    2000        5242 :                  *DAG.getContext());
    2001             : 
    2002             :   // Analyze outgoing return values.
    2003        2621 :   CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
    2004             : 
    2005        2621 :   SDValue Flag;
    2006             :   SmallVector<SDValue, 48> RetOps;
    2007        2621 :   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
    2008             : 
    2009             :   // Add return address for callable functions.
    2010        2621 :   if (!Info->isEntryFunction()) {
    2011        1369 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2012             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2013        2738 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2014             : 
    2015             :     // FIXME: Should be able to use a vreg here, but need a way to prevent it
    2016             :     // from being allocated to a CSR.
    2017             : 
    2018             :     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2019        1369 :                                                 MVT::i64);
    2020             : 
    2021        1369 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
    2022        1369 :     Flag = Chain.getValue(1);
    2023             : 
    2024        1369 :     RetOps.push_back(PhysReturnAddrReg);
    2025             :   }
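                     :   // (Illustrative) the return address register is typically s[30:31];
                     :   // attaching it to the return keeps it live for the final s_setpc_b64.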
    2026             : 
    2027             :   // Copy the result values into the output registers.
    2028        3466 :   for (unsigned i = 0, realRVLocIdx = 0;
    2029       12174 :        i != RVLocs.size();
    2030             :        ++i, ++realRVLocIdx) {
    2031             :     CCValAssign &VA = RVLocs[i];
    2032             :     assert(VA.isRegLoc() && "Can only return in registers!");
    2033             :     // TODO: Partially return in registers if return values don't fit.
    2034             : 
    2035        3466 :     SDValue Arg = SplitVals[realRVLocIdx];
    2036             : 
    2037             :     // Copied from other backends.
    2038        3466 :     switch (VA.getLocInfo()) {
    2039             :     case CCValAssign::Full:
    2040             :       break;
    2041             :     case CCValAssign::BCvt:
    2042           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2043           0 :       break;
    2044             :     case CCValAssign::SExt:
    2045           0 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2046           0 :       break;
    2047             :     case CCValAssign::ZExt:
    2048           0 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2049           0 :       break;
    2050             :     case CCValAssign::AExt:
    2051           3 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2052           3 :       break;
    2053           0 :     default:
    2054           0 :       llvm_unreachable("Unknown loc info!");
    2055             :     }
    2056             : 
    2057        3466 :     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    2058        3466 :     Flag = Chain.getValue(1);
    2059        3466 :     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    2060             :   }
    2061             : 
    2062             :   // FIXME: Does sret work properly?
    2063        2621 :   if (!Info->isEntryFunction()) {
    2064        1369 :     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    2065             :     const MCPhysReg *I =
    2066        1369 :       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    2067        1369 :     if (I) {
    2068           0 :       for (; *I; ++I) {
    2069           0 :         if (AMDGPU::SReg_64RegClass.contains(*I))
    2070           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
    2071           0 :         else if (AMDGPU::SReg_32RegClass.contains(*I))
    2072           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
    2073             :         else
    2074           0 :           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    2075             :       }
    2076             :     }
    2077             :   }
    2078             : 
    2079             :   // Update chain and glue.
    2080        2621 :   RetOps[0] = Chain;
    2081        2621 :   if (Flag.getNode())
    2082        2167 :     RetOps.push_back(Flag);
    2083             : 
    2084             :   unsigned Opc = AMDGPUISD::ENDPGM;
    2085        2621 :   if (!IsWaveEnd)
    2086        2167 :     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
    2087        2621 :   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    2088             : }
    2089             : 
    2090         453 : SDValue SITargetLowering::LowerCallResult(
    2091             :     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    2092             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    2093             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    2094             :     SDValue ThisVal) const {
    2095         453 :   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
    2096             : 
    2097             :   // Assign locations to each value returned by this call.
    2098             :   SmallVector<CCValAssign, 16> RVLocs;
    2099             :   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
    2100         906 :                  *DAG.getContext());
    2101         453 :   CCInfo.AnalyzeCallResult(Ins, RetCC);
    2102             : 
    2103             :   // Copy all of the result registers out of their specified physreg.
    2104        1287 :   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    2105         127 :     CCValAssign VA = RVLocs[i];
    2106         127 :     SDValue Val;
    2107             : 
    2108         127 :     if (VA.isRegLoc()) {
    2109         127 :       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    2110         127 :       Chain = Val.getValue(1);
    2111         127 :       InFlag = Val.getValue(2);
    2112             :     } else if (VA.isMemLoc()) {
    2113           0 :       report_fatal_error("TODO: return values in memory");
    2114             :     } else
    2115             :       llvm_unreachable("unknown argument location type");
    2116             : 
    2117         127 :     switch (VA.getLocInfo()) {
    2118             :     case CCValAssign::Full:
    2119             :       break;
    2120             :     case CCValAssign::BCvt:
    2121           0 :       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
    2122           0 :       break;
    2123             :     case CCValAssign::ZExt:
    2124           7 :       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
    2125          14 :                         DAG.getValueType(VA.getValVT()));
    2126           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2127           7 :       break;
    2128             :     case CCValAssign::SExt:
    2129           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
    2130          14 :                         DAG.getValueType(VA.getValVT()));
    2131           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2132           7 :       break;
    2133             :     case CCValAssign::AExt:
    2134           3 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    2135           3 :       break;
    2136           0 :     default:
    2137           0 :       llvm_unreachable("Unknown loc info!");
    2138             :     }
    2139             : 
    2140         127 :     InVals.push_back(Val);
    2141             :   }
    2142             : 
    2143         906 :   return Chain;
    2144             : }
    2145             : 
    2146             : // Add code to pass the special inputs required by the features the callee uses,
    2147             : // separate from the explicit user arguments present in the IR.
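                     : // For example (illustrative), if the callee uses llvm.amdgcn.workgroup.id.y,
                     : // the caller must forward its own workgroup ID Y input, either in the
                     : // register the callee expects or through the callee's stack argument area;
                     : // the exact set handled is the InputRegs list below.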
    2148         487 : void SITargetLowering::passSpecialInputs(
    2149             :     CallLoweringInfo &CLI,
    2150             :     const SIMachineFunctionInfo &Info,
    2151             :     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    2152             :     SmallVectorImpl<SDValue> &MemOpChains,
    2153             :     SDValue Chain,
    2154             :     SDValue StackPtr) const {
    2155             :   // If we don't have a call site, this was a call inserted by
    2156             :   // legalization. These can never use special inputs.
    2157         487 :   if (!CLI.CS)
    2158           0 :     return;
    2159             : 
    2160             :   const Function *CalleeFunc = CLI.CS.getCalledFunction();
    2161             :   assert(CalleeFunc);
    2162             : 
    2163         487 :   SelectionDAG &DAG = CLI.DAG;
    2164         487 :   const SDLoc &DL = CLI.DL;
    2165             : 
    2166         487 :   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    2167             : 
    2168             :   auto &ArgUsageInfo =
    2169         487 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    2170             :   const AMDGPUFunctionArgInfo &CalleeArgInfo
    2171             :     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    2172             : 
    2173             :   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
    2174             : 
    2175             :   // TODO: Unify with private memory register handling. This is complicated by
    2176             :   // the fact that at least in kernels, the input argument is not necessarily
    2177             :   // in the same location as the input.
    2178         487 :   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    2179             :     AMDGPUFunctionArgInfo::DISPATCH_PTR,
    2180             :     AMDGPUFunctionArgInfo::QUEUE_PTR,
    2181             :     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    2182             :     AMDGPUFunctionArgInfo::DISPATCH_ID,
    2183             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    2184             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    2185             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    2186             :     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
    2187             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
    2188             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
    2189             :     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
    2190             :   };
    2191             : 
    2192       11201 :   for (auto InputID : InputRegs) {
    2193             :     const ArgDescriptor *OutgoingArg;
    2194             :     const TargetRegisterClass *ArgRC;
    2195             : 
    2196       10714 :     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    2197        5357 :     if (!OutgoingArg)
    2198        5236 :       continue;
    2199             : 
    2200             :     const ArgDescriptor *IncomingArg;
    2201             :     const TargetRegisterClass *IncomingArgRC;
    2202             :     std::tie(IncomingArg, IncomingArgRC)
    2203         242 :       = CallerArgInfo.getPreloadedValue(InputID);
    2204             :     assert(IncomingArgRC == ArgRC);
    2205             : 
    2206             :     // All special arguments are ints for now.
    2207         121 :     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    2208         121 :     SDValue InputReg;
    2209             : 
    2210         121 :     if (IncomingArg) {
    2211         111 :       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    2212             :     } else {
    2213             :       // The implicit arg ptr is special because it doesn't have a corresponding
    2214             :       // input for kernels, and is computed from the kernarg segment pointer.
    2215             :       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    2216          10 :       InputReg = getImplicitArgPtr(DAG, DL);
    2217             :     }
    2218             : 
    2219         242 :     if (OutgoingArg->isRegister()) {
    2220         111 :       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    2221             :     } else {
    2222             :       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
    2223             :                                               InputReg,
    2224          10 :                                               OutgoingArg->getStackOffset());
    2225          10 :       MemOpChains.push_back(ArgStore);
    2226             :     }
    2227             :   }
    2228             : }
    2229             : 
    2230             : static bool canGuaranteeTCO(CallingConv::ID CC) {
    2231          39 :   return CC == CallingConv::Fast;
    2232             : }
    2233             : 
    2234             : /// Return true if we might ever do TCO for calls with this calling convention.
    2235             : static bool mayTailCallThisCC(CallingConv::ID CC) {
    2236          43 :   switch (CC) {
    2237             :   case CallingConv::C:
    2238             :     return true;
    2239             :   default:
    2240             :     return canGuaranteeTCO(CC);
    2241             :   }
    2242             : }
    2243             : 
    2244          43 : bool SITargetLowering::isEligibleForTailCallOptimization(
    2245             :     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    2246             :     const SmallVectorImpl<ISD::OutputArg> &Outs,
    2247             :     const SmallVectorImpl<SDValue> &OutVals,
    2248             :     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
    2249          39 :   if (!mayTailCallThisCC(CalleeCC))
    2250             :     return false;
    2251             : 
    2252          43 :   MachineFunction &MF = DAG.getMachineFunction();
    2253          43 :   const Function &CallerF = MF.getFunction();
    2254             :   CallingConv::ID CallerCC = CallerF.getCallingConv();
    2255          43 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2256          43 :   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
    2257             : 
    2258             :   // Kernels aren't callable, and don't have a live-in return address, so it
    2259             :   // doesn't make sense to do a tail call with entry functions.
    2260          43 :   if (!CallerPreserved)
    2261             :     return false;
    2262             : 
    2263             :   bool CCMatch = CallerCC == CalleeCC;
    2264             : 
    2265          40 :   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    2266           0 :     if (canGuaranteeTCO(CalleeCC) && CCMatch)
    2267             :       return true;
    2268             :     return false;
    2269             :   }
    2270             : 
    2271             :   // TODO: Can we handle var args?
    2272          40 :   if (IsVarArg)
    2273             :     return false;
    2274             : 
    2275         136 :   for (const Argument &Arg : CallerF.args()) {
    2276          99 :     if (Arg.hasByValAttr())
    2277             :       return false;
    2278             :   }
    2279             : 
    2280          37 :   LLVMContext &Ctx = *DAG.getContext();
    2281             : 
    2282             :   // Check that the call results are passed in the same way.
    2283          37 :   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
    2284             :                                   CCAssignFnForCall(CalleeCC, IsVarArg),
    2285             :                                   CCAssignFnForCall(CallerCC, IsVarArg)))
    2286             :     return false;
    2287             : 
    2288             :   // The callee has to preserve all registers the caller needs to preserve.
    2289          37 :   if (!CCMatch) {
    2290           0 :     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    2291           0 :     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    2292             :       return false;
    2293             :   }
    2294             : 
    2295             :   // Nothing more to check if the callee is taking no arguments.
    2296          37 :   if (Outs.empty())
    2297             :     return true;
    2298             : 
    2299             :   SmallVector<CCValAssign, 16> ArgLocs;
    2300          66 :   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
    2301             : 
    2302          33 :   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
    2303             : 
    2304          33 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    2305             :   // If the stack arguments for this call do not fit into our own save area then
    2306             :   // the call cannot be made tail.
    2307             :   // TODO: Is this really necessary?
    2308          33 :   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    2309             :     return false;
    2310             : 
    2311          30 :   const MachineRegisterInfo &MRI = MF.getRegInfo();
    2312          30 :   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
    2313             : }
    2314             : 
    2315          18 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
    2316          18 :   if (!CI->isTailCall())
    2317             :     return false;
    2318             : 
    2319           4 :   const Function *ParentFn = CI->getParent()->getParent();
    2320           4 :   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    2321             :     return false;
    2322             : 
    2323           1 :   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
    2324           1 :   return (Attr.getValueAsString() != "true");
    2325             : }
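                     : // (Illustrative) together these checks mean that, e.g.,
                     : //   %r = tail call i32 @callee(i32 %x)
                     : // in a non-entry function can be emitted as a tail call and lowered to
                     : // TC_RETURN below, while the same call inside a kernel cannot.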
    2326             : 
    2327             : // The wave scratch offset register is used as the global base pointer.
    2328         494 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
    2329             :                                     SmallVectorImpl<SDValue> &InVals) const {
    2330         494 :   SelectionDAG &DAG = CLI.DAG;
    2331         494 :   const SDLoc &DL = CLI.DL;
    2332             :   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    2333             :   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
    2334             :   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
    2335         494 :   SDValue Chain = CLI.Chain;
    2336         494 :   SDValue Callee = CLI.Callee;
    2337             :   bool &IsTailCall = CLI.IsTailCall;
    2338         494 :   CallingConv::ID CallConv = CLI.CallConv;
    2339         494 :   bool IsVarArg = CLI.IsVarArg;
    2340             :   bool IsSibCall = false;
    2341             :   bool IsThisReturn = false;
    2342         494 :   MachineFunction &MF = DAG.getMachineFunction();
    2343             : 
    2344         494 :   if (IsVarArg) {
    2345             :     return lowerUnhandledCall(CLI, InVals,
    2346           2 :                               "unsupported call to variadic function ");
    2347             :   }
    2348             : 
    2349             :   if (!CLI.CS.getCalledFunction()) {
    2350             :     return lowerUnhandledCall(CLI, InVals,
    2351           8 :                               "unsupported indirect call to function ");
    2352             :   }
    2353             : 
    2354         489 :   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    2355             :     return lowerUnhandledCall(CLI, InVals,
    2356           2 :                               "unsupported required tail call to function ");
    2357             :   }
    2358             : 
    2359         976 :   if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
    2360             :     // Note the issue is with the CC of the calling function, not of the call
    2361             :     // itself.
    2362             :     return lowerUnhandledCall(CLI, InVals,
    2363           2 :                           "unsupported call from graphics shader of function ");
    2364             :   }
    2365             : 
    2366             :   // The first 4 bytes are reserved for the callee's emergency stack slot.
    2367             :   const unsigned CalleeUsableStackOffset = 4;
    2368             : 
    2369         487 :   if (IsTailCall) {
    2370          43 :     IsTailCall = isEligibleForTailCallOptimization(
    2371             :       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    2372          52 :     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
    2373           0 :       report_fatal_error("failed to perform tail call elimination on a call "
    2374             :                          "site marked musttail");
    2375             :     }
    2376             : 
    2377          43 :     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
    2378             : 
    2379             :     // A sibling call is one where we're under the usual C ABI and not planning
    2380             :     // to change that but can still do a tail call:
    2381          86 :     if (!TailCallOpt && IsTailCall)
    2382             :       IsSibCall = true;
    2383             : 
    2384             :     if (IsTailCall)
    2385             :       ++NumTailCalls;
    2386             :   }
    2387             : 
    2388             :   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    2389             :     // FIXME: Remove this hack for function pointer types after removing
    2390             :     // support of old address space mapping. In the new address space
    2391             :     // mapping the pointer in default address space is 64 bit, therefore
    2392             :     // does not need this hack.
    2393         487 :     if (Callee.getValueType() == MVT::i32) {
    2394           0 :       const GlobalValue *GV = GA->getGlobal();
    2395           0 :       Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
    2396           0 :                                     GA->getTargetFlags());
    2397             :     }
    2398             :   }
    2399             :   assert(Callee.getValueType() == MVT::i64);
    2400             : 
    2401         487 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    2402             : 
    2403             :   // Analyze operands of the call, assigning locations to each operand.
    2404             :   SmallVector<CCValAssign, 16> ArgLocs;
    2405         974 :   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    2406         487 :   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
    2407         487 :   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
    2408             : 
    2409             :   // Get a count of how many bytes are to be pushed on the stack.
    2410         487 :   unsigned NumBytes = CCInfo.getNextStackOffset();
    2411             : 
    2412         487 :   if (IsSibCall) {
    2413             :     // Since we're not changing the ABI to make this a tail call, the memory
    2414             :     // operands are already available in the caller's incoming argument space.
    2415             :     NumBytes = 0;
    2416             :   }
    2417             : 
    2418             :   // FPDiff is the byte offset of the call's argument area from the callee's.
    2419             :   // Stores to callee stack arguments will be placed in FixedStackSlots offset
    2420             :   // by this amount for a tail call. In a sibling call it must be 0 because the
    2421             :   // caller will deallocate the entire stack and the callee still expects its
    2422             :   // arguments to begin at SP+0. Completely unused for non-tail calls.
    2423             :   int32_t FPDiff = 0;
    2424         487 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    2425             :   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    2426             : 
    2427         487 :   SDValue CallerSavedFP;
    2428             : 
    2429             :   // Adjust the stack pointer for the new arguments...
    2430             :   // These operations are automatically eliminated by the prolog/epilog pass
    2431         487 :   if (!IsSibCall) {
    2432         453 :     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    2433             : 
    2434         453 :     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
    2435             : 
    2436             :     // In the HSA case, this should be an identity copy.
    2437             :     SDValue ScratchRSrcReg
    2438         453 :       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    2439         453 :     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    2440             : 
    2441             :     // TODO: Don't hardcode these registers; get them from the callee function.
    2442             :     SDValue ScratchWaveOffsetReg
    2443         453 :       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
    2444         453 :     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
    2445             : 
    2446         453 :     if (!Info->isEntryFunction()) {
    2447             :       // Avoid clobbering this function's FP value. In the current convention
    2448             :       // callee will overwrite this, so do save/restore around the call site.
    2449          98 :       CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
    2450         196 :                                          Info->getFrameOffsetReg(), MVT::i32);
    2451             :     }
    2452             :   }
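                     :   // (Illustrative) at this point the callee's ABI inputs are queued up:
                     :   //   s[0:3] <- scratch resource descriptor, s4 <- scratch wave offset,
                     :   // matching the hardcoded registers used above.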
    2453             : 
    2454             :   // Stack pointer relative accesses are done by changing the offset SGPR. This
    2455             :   // is just the VGPR offset component.
    2456         487 :   SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
    2457             : 
    2458             :   SmallVector<SDValue, 8> MemOpChains;
    2459             :   MVT PtrVT = MVT::i32;
    2460             : 
    2461             :   // Walk the register/memloc assignments, inserting copies/loads.
    2462        1540 :   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
    2463             :        ++i, ++realArgIdx) {
    2464        1053 :     CCValAssign &VA = ArgLocs[i];
    2465        1053 :     SDValue Arg = OutVals[realArgIdx];
    2466             : 
    2467             :     // Promote the value if needed.
    2468        1053 :     switch (VA.getLocInfo()) {
    2469             :     case CCValAssign::Full:
    2470             :       break;
    2471             :     case CCValAssign::BCvt:
    2472           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2473           0 :       break;
    2474             :     case CCValAssign::ZExt:
    2475          10 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2476          10 :       break;
    2477             :     case CCValAssign::SExt:
    2478          10 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2479          10 :       break;
    2480             :     case CCValAssign::AExt:
    2481           4 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2482           4 :       break;
    2483             :     case CCValAssign::FPExt:
    2484           0 :       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
    2485           0 :       break;
    2486           0 :     default:
    2487           0 :       llvm_unreachable("Unknown loc info!");
    2488             :     }
    2489             : 
    2490        1053 :     if (VA.isRegLoc()) {
    2491        1982 :       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    2492             :     } else {
    2493             :       assert(VA.isMemLoc());
    2494             : 
    2495          62 :       SDValue DstAddr;
    2496             :       MachinePointerInfo DstInfo;
    2497             : 
    2498          62 :       unsigned LocMemOffset = VA.getLocMemOffset();
    2499          62 :       int32_t Offset = LocMemOffset;
    2500             : 
    2501          62 :       SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
    2502             : 
    2503          62 :       if (IsTailCall) {
    2504          27 :         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    2505          27 :         unsigned OpSize = Flags.isByVal() ?
    2506          51 :           Flags.getByValSize() : VA.getValVT().getStoreSize();
    2507             : 
    2508             :         Offset = Offset + FPDiff;
    2509          27 :         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
    2510             : 
    2511          27 :         DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
    2512          54 :                                          StackPtr);
    2513          27 :         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
    2514             : 
    2515             :         // Make sure any stack arguments overlapping with where we're storing
    2516             :         // are loaded before this eventual operation. Otherwise they'll be
    2517             :         // clobbered.
    2518             : 
    2519             :         // FIXME: Why is this really necessary? This seems to just result in a
    2520             :         // lot of code that copies the stack arguments and writes them back to the same
    2521             :         // locations, which are supposed to be immutable?
    2522          27 :         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
    2523             :       } else {
    2524          35 :         DstAddr = PtrOff;
    2525          35 :         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
    2526             :       }
    2527             : 
    2528          62 :       if (Outs[i].Flags.isByVal()) {
    2529             :         SDValue SizeNode =
    2530          28 :             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
    2531             :         SDValue Cpy = DAG.getMemcpy(
    2532             :             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
    2533             :             /*isVol = */ false, /*AlwaysInline = */ true,
    2534             :             /*isTailCall = */ false, DstInfo,
    2535          28 :             MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
    2536          56 :                 *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
    2537             : 
    2538          28 :         MemOpChains.push_back(Cpy);
    2539             :       } else {
    2540          34 :         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
    2541          34 :         MemOpChains.push_back(Store);
    2542             :       }
    2543             :     }
    2544             :   }
    2545             : 
    2546             :   // Copy special input registers after user input arguments.
    2547         487 :   passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
    2548             : 
    2549         487 :   if (!MemOpChains.empty())
    2550          46 :     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
    2551             : 
    2552             :   // Build a sequence of copy-to-reg nodes chained together with token chain
    2553             :   // and flag operands which copy the outgoing args into the appropriate regs.
    2554         487 :   SDValue InFlag;
    2555        4503 :   for (auto &RegToPass : RegsToPass) {
    2556        2008 :     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
    2557        2008 :                              RegToPass.second, InFlag);
    2558        2008 :     InFlag = Chain.getValue(1);
    2559             :   }
    2560             : 
    2561             : 
    2562         487 :   SDValue PhysReturnAddrReg;
    2563         487 :   if (IsTailCall) {
    2564             :     // Since the return is being combined with the call, we need to pass on the
    2565             :     // return address.
    2566             : 
    2567          34 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2568             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2569          68 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2570             : 
    2571          34 :     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2572          68 :                                         MVT::i64);
    2573          34 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    2574          34 :     InFlag = Chain.getValue(1);
    2575             :   }
    2576             : 
    2577             :   // We don't usually want to end the call-sequence here because we would tidy
    2578             :   // the frame up *after* the call; however, in the ABI-changing tail-call case
    2579             :   // we've carefully laid out the parameters so that when sp is reset they'll be
    2580             :   // in the correct location.
    2581         487 :   if (IsTailCall && !IsSibCall) {
    2582           0 :     Chain = DAG.getCALLSEQ_END(Chain,
    2583             :                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
    2584             :                                DAG.getTargetConstant(0, DL, MVT::i32),
    2585           0 :                                InFlag, DL);
    2586           0 :     InFlag = Chain.getValue(1);
    2587             :   }
    2588             : 
    2589             :   std::vector<SDValue> Ops;
    2590         487 :   Ops.push_back(Chain);
    2591         487 :   Ops.push_back(Callee);
    2592             : 
    2593         487 :   if (IsTailCall) {
    2594             :     // Each tail call may have to adjust the stack by a different amount, so
    2595             :     // this information must travel along with the operation for eventual
    2596             :     // consumption by emitEpilogue.
    2597          68 :     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
    2598             : 
    2599          34 :     Ops.push_back(PhysReturnAddrReg);
    2600             :   }
    2601             : 
    2602             :   // Add argument registers to the end of the list so that they are known live
    2603             :   // into the call.
    2604        4503 :   for (auto &RegToPass : RegsToPass) {
    2605        4016 :     Ops.push_back(DAG.getRegister(RegToPass.first,
    2606        4016 :                                   RegToPass.second.getValueType()));
    2607             :   }
    2608             : 
    2609             :   // Add a register mask operand representing the call-preserved registers.
    2610             : 
    2611         487 :   auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
    2612         487 :   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
    2613             :   assert(Mask && "Missing call preserved mask for calling convention");
    2614         974 :   Ops.push_back(DAG.getRegisterMask(Mask));
    2615             : 
    2616         487 :   if (InFlag.getNode())
    2617         487 :     Ops.push_back(InFlag);
    2618             : 
    2619         487 :   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    2620             : 
    2621             :   // If we're doing a tail call, use a TC_RETURN here rather than an
    2622             :   // actual call instruction.
    2623         487 :   if (IsTailCall) {
    2624             :     MFI.setHasTailCall();
    2625          34 :     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
    2626             :   }
    2627             : 
    2628             :   // Returns a chain and a flag for retval copy to use.
    2629         453 :   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
    2630         453 :   Chain = Call.getValue(0);
    2631         453 :   InFlag = Call.getValue(1);
    2632             : 
    2633         453 :   if (CallerSavedFP) {
    2634          98 :     SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
    2635          98 :     Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
    2636          98 :     InFlag = Chain.getValue(1);
    2637             :   }
    2638             : 
    2639         453 :   uint64_t CalleePopBytes = NumBytes;
    2640         453 :   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
    2641             :                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
    2642             :                              InFlag, DL);
    2643         453 :   if (!Ins.empty())
    2644         113 :     InFlag = Chain.getValue(1);
    2645             : 
    2646             :   // Handle result values, copying them out of physregs into vregs that we
    2647             :   // return.
    2648             :   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
    2649             :                          InVals, IsThisReturn,
    2650         453 :                          IsThisReturn ? OutVals[0] : SDValue());
    2651             : }
    2652             : 
    2653          27 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
    2654             :                                              SelectionDAG &DAG) const {
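                     :   // (Illustrative) this is reached for the named-register intrinsics, e.g.
                     :   //   %e = call i64 @llvm.read_register.i64(metadata !0)
                     :   //   !0 = !{!"exec"}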
    2655          27 :   unsigned Reg = StringSwitch<unsigned>(RegName)
    2656             :     .Case("m0", AMDGPU::M0)
    2657             :     .Case("exec", AMDGPU::EXEC)
    2658             :     .Case("exec_lo", AMDGPU::EXEC_LO)
    2659             :     .Case("exec_hi", AMDGPU::EXEC_HI)
    2660             :     .Case("flat_scratch", AMDGPU::FLAT_SCR)
    2661             :     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    2662             :     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    2663             :     .Default(AMDGPU::NoRegister);
    2664             : 
    2665          27 :   if (Reg == AMDGPU::NoRegister) {
    2666           0 :     report_fatal_error(Twine("invalid register name \""
    2667             :                              + StringRef(RegName)  + "\"."));
    2668             : 
    2669             :   }
    2670             : 
    2671          30 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    2672           3 :       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    2673           1 :     report_fatal_error(Twine("invalid register \""
    2674             :                              + StringRef(RegName)  + "\" for subtarget."));
    2675             :   }
    2676             : 
    2677             :   switch (Reg) {
    2678          17 :   case AMDGPU::M0:
    2679             :   case AMDGPU::EXEC_LO:
    2680             :   case AMDGPU::EXEC_HI:
    2681             :   case AMDGPU::FLAT_SCR_LO:
    2682             :   case AMDGPU::FLAT_SCR_HI:
    2683          17 :     if (VT.getSizeInBits() == 32)
    2684             :       return Reg;
    2685             :     break;
    2686           9 :   case AMDGPU::EXEC:
    2687             :   case AMDGPU::FLAT_SCR:
    2688           9 :     if (VT.getSizeInBits() == 64)
    2689             :       return Reg;
    2690             :     break;
    2691           0 :   default:
    2692           0 :     llvm_unreachable("missing register type checking");
    2693             :   }
    2694             : 
    2695           2 :   report_fatal_error(Twine("invalid type for register \""
    2696             :                            + StringRef(RegName) + "\"."));
    2697             : }
    2698             : 
    2699             : // If kill is not the last instruction, split the block so kill is always a
    2700             : // proper terminator.
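                     : // (Illustrative) if a kill pseudo is followed by further instructions, the
                     : // block is split there: the kill becomes the terminator of the original
                     : // block and the remaining instructions move to a new successor block.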
    2701          84 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
    2702             :                                                     MachineBasicBlock *BB) const {
    2703          84 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    2704             : 
    2705             :   MachineBasicBlock::iterator SplitPoint(&MI);
    2706             :   ++SplitPoint;
    2707             : 
    2708          84 :   if (SplitPoint == BB->end()) {
    2709             :     // Don't bother with a new block.
    2710           8 :     MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2711           4 :     return BB;
    2712             :   }
    2713             : 
    2714          80 :   MachineFunction *MF = BB->getParent();
    2715             :   MachineBasicBlock *SplitBB
    2716          80 :     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
    2717             : 
    2718             :   MF->insert(++MachineFunction::iterator(BB), SplitBB);
    2719             :   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
    2720             : 
    2721          80 :   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
    2722          80 :   BB->addSuccessor(SplitBB);
    2723             : 
    2724         160 :   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
    2725          80 :   return SplitBB;
    2726             : }
    2727             : 
    2728             : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
    2729             : // wavefront. If the value is uniform and just happens to be in a VGPR, this
    2730             : // will only do one iteration. In the worst case, this will loop 64 times.
    2731             : //
    2732             : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
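                     : //
                     : // A sketch of the loop this emits (illustrative; ignoring the GPR-index
                     : // mode variant and any constant offset):
                     : //   loop:
                     : //     %cur  = v_readfirstlane_b32 %idx
                     : //     %cond = v_cmp_eq_u32 %cur, %idx
                     : //     %save = s_and_saveexec_b64 %cond   ; %save = exec, exec &= %cond
                     : //     s_mov_b32 m0, %cur
                     : //     ... indirect access using m0, inserted at the returned point ...
                     : //     s_xor_b64 exec, exec, %save        ; clear the lanes just handled
                     : //     s_cbranch_execnz loop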
    2733          32 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
    2734             :   const SIInstrInfo *TII,
    2735             :   MachineRegisterInfo &MRI,
    2736             :   MachineBasicBlock &OrigBB,
    2737             :   MachineBasicBlock &LoopBB,
    2738             :   const DebugLoc &DL,
    2739             :   const MachineOperand &IdxReg,
    2740             :   unsigned InitReg,
    2741             :   unsigned ResultReg,
    2742             :   unsigned PhiReg,
    2743             :   unsigned InitSaveExecReg,
    2744             :   int Offset,
    2745             :   bool UseGPRIdxMode,
    2746             :   bool IsIndirectSrc) {
    2747          32 :   MachineBasicBlock::iterator I = LoopBB.begin();
    2748             : 
    2749          32 :   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2750          32 :   unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2751          32 :   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2752          32 :   unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2753             : 
    2754          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    2755          32 :     .addReg(InitReg)
    2756             :     .addMBB(&OrigBB)
    2757          32 :     .addReg(ResultReg)
    2758             :     .addMBB(&LoopBB);
    2759             : 
    2760          64 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    2761          32 :     .addReg(InitSaveExecReg)
    2762             :     .addMBB(&OrigBB)
    2763          32 :     .addReg(NewExec)
    2764             :     .addMBB(&LoopBB);
    2765             : 
    2766             :   // Read the next variant <- also loop target.
    2767          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    2768          32 :     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    2769             : 
    2770             :   // Compare the just read M0 value to all possible Idx values.
    2771          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    2772          32 :     .addReg(CurrentIdxReg)
    2773          32 :     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
    2774             : 
    2775             :   // Update EXEC, save the original EXEC value to VCC.
    2776          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    2777          32 :     .addReg(CondReg, RegState::Kill);
    2778             : 
    2779             :   MRI.setSimpleHint(NewExec, CondReg);
    2780             : 
    2781          32 :   if (UseGPRIdxMode) {
    2782             :     unsigned IdxReg;
    2783          16 :     if (Offset == 0) {
    2784             :       IdxReg = CurrentIdxReg;
    2785             :     } else {
    2786           6 :       IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2787          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
    2788           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2789           6 :         .addImm(Offset);
    2790             :     }
    2791          16 :     unsigned IdxMode = IsIndirectSrc ?
    2792             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2793             :     MachineInstr *SetOn =
    2794          48 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2795          16 :       .addReg(IdxReg, RegState::Kill)
    2796          32 :       .addImm(IdxMode);
    2797          16 :     SetOn->getOperand(3).setIsUndef();
    2798             :   } else {
    2799             :     // Move index from VCC into M0
    2800          16 :     if (Offset == 0) {
    2801          30 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2802          10 :         .addReg(CurrentIdxReg, RegState::Kill);
    2803             :     } else {
    2804          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2805           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2806           6 :         .addImm(Offset);
    2807             :     }
    2808             :   }
    2809             : 
    2810             :   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
    2811             :   MachineInstr *InsertPt =
    2812          96 :     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    2813          32 :     .addReg(AMDGPU::EXEC)
    2814          32 :     .addReg(NewExec);
    2815             : 
    2816             :   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
    2817             :   // s_cbranch_scc0?
    2818             : 
    2819             :   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
    2820          96 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    2821             :     .addMBB(&LoopBB);
    2822             : 
    2823          32 :   return InsertPt->getIterator();
    2824             : }
    2825             : 
    2826             : // This has slightly sub-optimal regalloc when the source vector is killed by
    2827             : // the read. The register allocator does not understand that the kill is
    2828             : // per-workitem, so the source is kept alive for the whole loop and we end up
    2829             : // not re-using a subregister from it, using one more VGPR than necessary. This
    2830             : // VGPR was saved back when this was expanded after register allocation.
    2831          32 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
    2832             :                                                   MachineBasicBlock &MBB,
    2833             :                                                   MachineInstr &MI,
    2834             :                                                   unsigned InitResultReg,
    2835             :                                                   unsigned PhiReg,
    2836             :                                                   int Offset,
    2837             :                                                   bool UseGPRIdxMode,
    2838             :                                                   bool IsIndirectSrc) {
    2839          32 :   MachineFunction *MF = MBB.getParent();
    2840          32 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2841             :   const DebugLoc &DL = MI.getDebugLoc();
    2842             :   MachineBasicBlock::iterator I(&MI);
    2843             : 
    2844          32 :   unsigned DstReg = MI.getOperand(0).getReg();
    2845          32 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2846          32 :   unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    2847             : 
    2848          64 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
    2849             : 
    2850             :   // Save the EXEC mask
    2851          96 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    2852          32 :     .addReg(AMDGPU::EXEC);
    2853             : 
    2854             :   // To insert the loop we need to split the block. Move everything after this
    2855             :   // point to a new block, and insert a new empty block between the two.
    2856          32 :   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    2857          32 :   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    2858             :   MachineFunction::iterator MBBI(MBB);
    2859             :   ++MBBI;
    2860             : 
    2861             :   MF->insert(MBBI, LoopBB);
    2862             :   MF->insert(MBBI, RemainderBB);
    2863             : 
    2864          32 :   LoopBB->addSuccessor(LoopBB);
    2865          32 :   LoopBB->addSuccessor(RemainderBB);
    2866             : 
    2867             :   // Move the rest of the block into a new block.
    2868          32 :   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    2869             :   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    2870             : 
    2871          32 :   MBB.addSuccessor(LoopBB);
    2872             : 
    2873          32 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2874             : 
    2875             :   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
    2876             :                                       InitResultReg, DstReg, PhiReg, TmpExec,
    2877          32 :                                       Offset, UseGPRIdxMode, IsIndirectSrc);
    2878             : 
    2879          32 :   MachineBasicBlock::iterator First = RemainderBB->begin();
    2880          96 :   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    2881          32 :     .addReg(SaveExec);
    2882             : 
    2883          32 :   return InsPt;
    2884             : }
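// For reference, a rough sketch of the machine code these two helpers build
// (illustrative only, not taken from this file; register names are invented,
// and this shows the legacy M0 path with Offset == 0). The caller inserts the
// actual indexed access at the returned iterator, i.e. just before the
// s_xor_b64:
//
//       s_mov_b64  s[save:save+1], exec            ; save EXEC
//   loop:
//       v_readfirstlane_b32 s_idx, v_idx           ; take one lane's index
//       v_cmp_eq_u32_e64 s[cond:cond+1], s_idx, v_idx
//       s_and_saveexec_b64 s[new:new+1], s[cond:cond+1]
//       s_mov_b32  m0, s_idx                       ; index for the matching lanes
//       ; ... indexed access inserted here by the caller ...
//       s_xor_b64  exec, exec, s[new:new+1]        ; turn off the done lanes
//       s_cbranch_execnz loop
//   remainder:
//       s_mov_b64  exec, s[save:save+1]            ; restore EXEC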
    2885             : 
    2886             : // Returns subreg index, offset
    2887             : static std::pair<unsigned, int>
    2888         161 : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
    2889             :                             const TargetRegisterClass *SuperRC,
    2890             :                             unsigned VecReg,
    2891             :                             int Offset) {
    2892         161 :   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    2893             : 
    2894             :   // Skip out of bounds offsets, or else we would end up using an undefined
    2895             :   // register.
    2896         161 :   if (Offset >= NumElts || Offset < 0)
    2897          40 :     return std::make_pair(AMDGPU::sub0, Offset);
    2898             : 
    2899         242 :   return std::make_pair(AMDGPU::sub0 + Offset, 0);
    2900             : }
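// Illustrative examples (not from the source), assuming a 128-bit super
// register class, i.e. four 32-bit elements:
//   Offset  2  -> {AMDGPU::sub2, 0}   // constant index folded into the subreg
//   Offset  7  -> {AMDGPU::sub0, 7}   // out of bounds; kept as a dynamic offset
//   Offset -1  -> {AMDGPU::sub0, -1}  // negative; likewise kept as an offset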
    2901             : 
    2902             : // Return true if the index is an SGPR and was set.
    2903         161 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
    2904             :                                  MachineRegisterInfo &MRI,
    2905             :                                  MachineInstr &MI,
    2906             :                                  int Offset,
    2907             :                                  bool UseGPRIdxMode,
    2908             :                                  bool IsIndirectSrc) {
    2909         161 :   MachineBasicBlock *MBB = MI.getParent();
    2910             :   const DebugLoc &DL = MI.getDebugLoc();
    2911             :   MachineBasicBlock::iterator I(&MI);
    2912             : 
    2913         161 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2914         161 :   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
    2915             : 
    2916             :   assert(Idx->getReg() != AMDGPU::NoRegister);
    2917             : 
    2918         161 :   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    2919             :     return false;
    2920             : 
    2921         129 :   if (UseGPRIdxMode) {
    2922          31 :     unsigned IdxMode = IsIndirectSrc ?
    2923             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2924          31 :     if (Offset == 0) {
    2925             :       MachineInstr *SetOn =
    2926          34 :           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2927             :               .add(*Idx)
    2928          17 :               .addImm(IdxMode);
    2929             : 
    2930          17 :       SetOn->getOperand(3).setIsUndef();
    2931             :     } else {
    2932          14 :       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    2933          28 :       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
    2934             :           .add(*Idx)
    2935          14 :           .addImm(Offset);
    2936             :       MachineInstr *SetOn =
    2937          42 :         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2938          14 :         .addReg(Tmp, RegState::Kill)
    2939          28 :         .addImm(IdxMode);
    2940             : 
    2941          14 :       SetOn->getOperand(3).setIsUndef();
    2942             :     }
    2943             : 
    2944             :     return true;
    2945             :   }
    2946             : 
    2947          98 :   if (Offset == 0) {
    2948         252 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2949             :       .add(*Idx);
    2950             :   } else {
    2951          28 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2952             :       .add(*Idx)
    2953          14 :       .addImm(Offset);
    2954             :   }
    2955             : 
    2956             :   return true;
    2957             : }
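// Illustrative sketch (not from the source) of what the two SGPR-index paths
// above emit for a nonzero Offset:
//
//   GPR-index mode:   s_add_i32        s_tmp, s_idx, Offset
//                     s_set_gpr_idx_on s_tmp, IdxMode
//   legacy M0 mode:   s_add_i32        m0, s_idx, Offset
//
// With Offset == 0 the add is skipped and the index register is used directly
// (s_set_gpr_idx_on s_idx, IdxMode, or s_mov_b32 m0, s_idx).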
    2958             : 
    2959             : // Control flow needs to be inserted if indexing with a VGPR.
    2960          71 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
    2961             :                                           MachineBasicBlock &MBB,
    2962             :                                           const SISubtarget &ST) {
    2963             :   const SIInstrInfo *TII = ST.getInstrInfo();
    2964             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    2965          71 :   MachineFunction *MF = MBB.getParent();
    2966          71 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2967             : 
    2968          71 :   unsigned Dst = MI.getOperand(0).getReg();
    2969          71 :   unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
    2970          71 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    2971             : 
    2972             :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
    2973             : 
    2974             :   unsigned SubReg;
    2975             :   std::tie(SubReg, Offset)
    2976         142 :     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
    2977             : 
    2978             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    2979             : 
    2980          71 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    2981             :     MachineBasicBlock::iterator I(&MI);
    2982             :     const DebugLoc &DL = MI.getDebugLoc();
    2983             : 
    2984          59 :     if (UseGPRIdxMode) {
    2985             :       // TODO: Look at the uses to avoid the copy. This may require rescheduling
    2986             :       // to avoid interfering with other uses, so probably requires a new
    2987             :       // optimization pass.
    2988          51 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2989          17 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2990          17 :         .addReg(SrcReg, RegState::Implicit)
    2991          17 :         .addReg(AMDGPU::M0, RegState::Implicit);
    2992          34 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2993             :     } else {
    2994         126 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2995          42 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2996          42 :         .addReg(SrcReg, RegState::Implicit);
    2997             :     }
    2998             : 
    2999          59 :     MI.eraseFromParent();
    3000             : 
    3001             :     return &MBB;
    3002             :   }
    3003             : 
    3004             :   const DebugLoc &DL = MI.getDebugLoc();
    3005             :   MachineBasicBlock::iterator I(&MI);
    3006             : 
    3007          12 :   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3008          12 :   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3009             : 
    3010          24 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
    3011             : 
    3012             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
    3013          12 :                               Offset, UseGPRIdxMode, true);
    3014          12 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    3015             : 
    3016          12 :   if (UseGPRIdxMode) {
    3017          18 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    3018           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    3019           6 :       .addReg(SrcReg, RegState::Implicit)
    3020           6 :       .addReg(AMDGPU::M0, RegState::Implicit);
    3021          12 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3022             :   } else {
    3023          18 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    3024           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    3025           6 :       .addReg(SrcReg, RegState::Implicit);
    3026             :   }
    3027             : 
    3028          12 :   MI.eraseFromParent();
    3029             : 
    3030          12 :   return LoopBB;
    3031             : }
    3032             : 
    3033          66 : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
    3034             :                                  const TargetRegisterClass *VecRC) {
    3035          66 :   switch (TRI.getRegSizeInBits(*VecRC)) {
    3036             :   case 32: // 4 bytes
    3037             :     return AMDGPU::V_MOVRELD_B32_V1;
    3038           6 :   case 64: // 8 bytes
    3039           6 :     return AMDGPU::V_MOVRELD_B32_V2;
    3040          38 :   case 128: // 16 bytes
    3041          38 :     return AMDGPU::V_MOVRELD_B32_V4;
    3042          16 :   case 256: // 32 bytes
    3043          16 :     return AMDGPU::V_MOVRELD_B32_V8;
    3044           6 :   case 512: // 64 bytes
    3045           6 :     return AMDGPU::V_MOVRELD_B32_V16;
    3046           0 :   default:
    3047           0 :     llvm_unreachable("unsupported size for MOVRELD pseudos");
    3048             :   }
    3049             : }
    3050             : 
    3051          90 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
    3052             :                                           MachineBasicBlock &MBB,
    3053             :                                           const SISubtarget &ST) {
    3054             :   const SIInstrInfo *TII = ST.getInstrInfo();
    3055             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    3056          90 :   MachineFunction *MF = MBB.getParent();
    3057          90 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    3058             : 
    3059          90 :   unsigned Dst = MI.getOperand(0).getReg();
    3060          90 :   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    3061          90 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    3062          90 :   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
    3063          90 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    3064          90 :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
    3065             : 
    3066             :   // This can be an immediate, but will be folded later.
    3067             :   assert(Val->getReg());
    3068             : 
    3069             :   unsigned SubReg;
    3070         180 :   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
    3071             :                                                          SrcVec->getReg(),
    3072             :                                                          Offset);
    3073             :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    3074             : 
    3075          90 :   if (Idx->getReg() == AMDGPU::NoRegister) {
    3076             :     MachineBasicBlock::iterator I(&MI);
    3077             :     const DebugLoc &DL = MI.getDebugLoc();
    3078             : 
    3079             :     assert(Offset == 0);
    3080             : 
    3081           0 :     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
    3082             :         .add(*SrcVec)
    3083             :         .add(*Val)
    3084           0 :         .addImm(SubReg);
    3085             : 
    3086           0 :     MI.eraseFromParent();
    3087             :     return &MBB;
    3088             :   }
    3089             : 
    3090          90 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    3091             :     MachineBasicBlock::iterator I(&MI);
    3092             :     const DebugLoc &DL = MI.getDebugLoc();
    3093             : 
    3094          70 :     if (UseGPRIdxMode) {
    3095          42 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3096          14 :           .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
    3097             :           .add(*Val)
    3098          14 :           .addReg(Dst, RegState::ImplicitDefine)
    3099          14 :           .addReg(SrcVec->getReg(), RegState::Implicit)
    3100          14 :           .addReg(AMDGPU::M0, RegState::Implicit);
    3101             : 
    3102          28 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3103             :     } else {
    3104          56 :       const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3105             : 
    3106         112 :       BuildMI(MBB, I, DL, MovRelDesc)
    3107          56 :           .addReg(Dst, RegState::Define)
    3108          56 :           .addReg(SrcVec->getReg())
    3109             :           .add(*Val)
    3110          56 :           .addImm(SubReg - AMDGPU::sub0);
    3111             :     }
    3112             : 
    3113          70 :     MI.eraseFromParent();
    3114             :     return &MBB;
    3115             :   }
    3116             : 
    3117          20 :   if (Val->isReg())
    3118          20 :     MRI.clearKillFlags(Val->getReg());
    3119             : 
    3120             :   const DebugLoc &DL = MI.getDebugLoc();
    3121             : 
    3122          20 :   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
    3123             : 
    3124             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
    3125          20 :                               Offset, UseGPRIdxMode, false);
    3126          20 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    3127             : 
    3128          20 :   if (UseGPRIdxMode) {
    3129          30 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    3130          10 :         .addReg(PhiReg, RegState::Undef, SubReg) // vdst
    3131             :         .add(*Val)                               // src0
    3132          10 :         .addReg(Dst, RegState::ImplicitDefine)
    3133          10 :         .addReg(PhiReg, RegState::Implicit)
    3134          10 :         .addReg(AMDGPU::M0, RegState::Implicit);
    3135          20 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    3136             :   } else {
    3137          10 :     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    3138             : 
    3139          20 :     BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
    3140          10 :         .addReg(Dst, RegState::Define)
    3141          10 :         .addReg(PhiReg)
    3142             :         .add(*Val)
    3143          10 :         .addImm(SubReg - AMDGPU::sub0);
    3144             :   }
    3145             : 
    3146          20 :   MI.eraseFromParent();
    3147             : 
    3148          20 :   return LoopBB;
    3149             : }
    3150             : 
    3151       13667 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    3152             :   MachineInstr &MI, MachineBasicBlock *BB) const {
    3153             : 
    3154       13667 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3155       13667 :   MachineFunction *MF = BB->getParent();
    3156       13667 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    3157             : 
    3158       13667 :   if (TII->isMIMG(MI)) {
    3159         652 :     if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
    3160           0 :       report_fatal_error("missing mem operand from MIMG instruction");
    3161             :     }
    3162             :     // Add a memoperand for mimg instructions so that they aren't assumed to
    3163             :     // be ordered memory instructions.
    3164             : 
    3165             :     return BB;
    3166             :   }
    3167             : 
    3168       13015 :   switch (MI.getOpcode()) {
    3169        2269 :   case AMDGPU::S_ADD_U64_PSEUDO:
    3170             :   case AMDGPU::S_SUB_U64_PSEUDO: {
    3171        2269 :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3172             :     const DebugLoc &DL = MI.getDebugLoc();
    3173             : 
    3174        2269 :     MachineOperand &Dest = MI.getOperand(0);
    3175             :     MachineOperand &Src0 = MI.getOperand(1);
    3176             :     MachineOperand &Src1 = MI.getOperand(2);
    3177             : 
    3178        2269 :     unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3179        2269 :     unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3180             : 
    3181             :     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3182             :      Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3183        2269 :      &AMDGPU::SReg_32_XM0RegClass);
    3184             :     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3185             :       Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3186        2269 :       &AMDGPU::SReg_32_XM0RegClass);
    3187             : 
    3188             :     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
    3189             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
    3190        2269 :       &AMDGPU::SReg_32_XM0RegClass);
    3191             :     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
    3192             :       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
    3193        2269 :       &AMDGPU::SReg_32_XM0RegClass);
    3194             : 
    3195        2269 :     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    3196             : 
    3197        2269 :     unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    3198        2269 :     unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    3199        4538 :     BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
    3200             :       .add(Src0Sub0)
    3201             :       .add(Src1Sub0);
    3202        4538 :     BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
    3203             :       .add(Src0Sub1)
    3204             :       .add(Src1Sub1);
    3205        6807 :     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
    3206        2269 :       .addReg(DestSub0)
    3207             :       .addImm(AMDGPU::sub0)
    3208        2269 :       .addReg(DestSub1)
    3209             :       .addImm(AMDGPU::sub1);
    3210        2269 :     MI.eraseFromParent();
    3211             :     return BB;
    3212             :   }
    3213        8913 :   case AMDGPU::SI_INIT_M0: {
    3214       17826 :     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
    3215       17826 :             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    3216        8913 :         .add(MI.getOperand(0));
    3217        8913 :     MI.eraseFromParent();
    3218        8913 :     return BB;
    3219             :   }
    3220           3 :   case AMDGPU::SI_INIT_EXEC:
    3221             :     // This should be before all vector instructions.
    3222             :     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    3223           3 :             AMDGPU::EXEC)
    3224           3 :         .addImm(MI.getOperand(0).getImm());
    3225           3 :     MI.eraseFromParent();
    3226           3 :     return BB;
    3227             : 
    3228             :   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    3229             :     // Extract the thread count from an SGPR input and set EXEC accordingly.
    3230             :     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    3231             :     //
    3232             :     // S_BFE_U32 count, input, {shift, 7}
    3233             :     // S_BFM_B64 exec, count, 0
    3234             :     // S_CMP_EQ_U32 count, 64
    3235             :     // S_CMOV_B64 exec, -1
    3236             :     MachineInstr *FirstMI = &*BB->begin();
    3237           4 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3238           4 :     unsigned InputReg = MI.getOperand(0).getReg();
    3239           4 :     unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3240             :     bool Found = false;
    3241             : 
    3242             :     // Move the COPY of the input reg to the beginning, so that we can use it.
    3243          14 :     for (auto I = BB->begin(); I != &MI; I++) {
    3244          36 :       if (I->getOpcode() != TargetOpcode::COPY ||
    3245          10 :           I->getOperand(0).getReg() != InputReg)
    3246             :         continue;
    3247             : 
    3248           4 :       if (I == FirstMI) {
    3249           0 :         FirstMI = &*++BB->begin();
    3250             :       } else {
    3251           4 :         I->removeFromParent();
    3252             :         BB->insert(FirstMI, &*I);
    3253             :       }
    3254             :       Found = true;
    3255             :       break;
    3256             :     }
    3257             :     assert(Found);
    3258             :     (void)Found;
    3259             : 
    3260             :     // This should be before all vector instructions.
    3261          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
    3262           4 :         .addReg(InputReg)
    3263           4 :         .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    3264          16 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
    3265           4 :             AMDGPU::EXEC)
    3266           4 :         .addReg(CountReg)
    3267             :         .addImm(0);
    3268          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
    3269           4 :         .addReg(CountReg, RegState::Kill)
    3270             :         .addImm(64);
    3271           8 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
    3272           4 :             AMDGPU::EXEC)
    3273             :         .addImm(-1);
    3274           4 :     MI.eraseFromParent();
    3275           4 :     return BB;
    3276             :   }
    3277             : 
    3278             :   case AMDGPU::GET_GROUPSTATICSIZE: {
    3279             :     DebugLoc DL = MI.getDebugLoc();
    3280         122 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
    3281          61 :         .add(MI.getOperand(0))
    3282          61 :         .addImm(MFI->getLDSSize());
    3283          61 :     MI.eraseFromParent();
    3284             :     return BB;
    3285             :   }
    3286          71 :   case AMDGPU::SI_INDIRECT_SRC_V1:
    3287             :   case AMDGPU::SI_INDIRECT_SRC_V2:
    3288             :   case AMDGPU::SI_INDIRECT_SRC_V4:
    3289             :   case AMDGPU::SI_INDIRECT_SRC_V8:
    3290             :   case AMDGPU::SI_INDIRECT_SRC_V16:
    3291          71 :     return emitIndirectSrc(MI, *BB, *getSubtarget());
    3292          90 :   case AMDGPU::SI_INDIRECT_DST_V1:
    3293             :   case AMDGPU::SI_INDIRECT_DST_V2:
    3294             :   case AMDGPU::SI_INDIRECT_DST_V4:
    3295             :   case AMDGPU::SI_INDIRECT_DST_V8:
    3296             :   case AMDGPU::SI_INDIRECT_DST_V16:
    3297          90 :     return emitIndirectDst(MI, *BB, *getSubtarget());
    3298          84 :   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    3299             :   case AMDGPU::SI_KILL_I1_PSEUDO:
    3300          84 :     return splitKillBlock(MI, BB);
    3301          49 :   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    3302          49 :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    3303             : 
    3304          49 :     unsigned Dst = MI.getOperand(0).getReg();
    3305          49 :     unsigned Src0 = MI.getOperand(1).getReg();
    3306          49 :     unsigned Src1 = MI.getOperand(2).getReg();
    3307             :     const DebugLoc &DL = MI.getDebugLoc();
    3308          49 :     unsigned SrcCond = MI.getOperand(3).getReg();
    3309             : 
    3310          49 :     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3311          49 :     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3312          49 :     unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    3313             : 
    3314         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
    3315          49 :       .addReg(SrcCond);
    3316         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
    3317          49 :       .addReg(Src0, 0, AMDGPU::sub0)
    3318          49 :       .addReg(Src1, 0, AMDGPU::sub0)
    3319          49 :       .addReg(SrcCondCopy);
    3320         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
    3321          49 :       .addReg(Src0, 0, AMDGPU::sub1)
    3322          49 :       .addReg(Src1, 0, AMDGPU::sub1)
    3323          49 :       .addReg(SrcCondCopy);
    3324             : 
    3325         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
    3326          49 :       .addReg(DstLo)
    3327             :       .addImm(AMDGPU::sub0)
    3328          49 :       .addReg(DstHi)
    3329             :       .addImm(AMDGPU::sub1);
    3330          49 :     MI.eraseFromParent();
    3331          49 :     return BB;
    3332             :   }
    3333          78 :   case AMDGPU::SI_BR_UNDEF: {
    3334          78 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3335             :     const DebugLoc &DL = MI.getDebugLoc();
    3336         234 :     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    3337          78 :                            .add(MI.getOperand(0));
    3338          78 :     Br->getOperand(1).setIsUndef(true); // read undef SCC
    3339          78 :     MI.eraseFromParent();
    3340          78 :     return BB;
    3341             :   }
    3342         906 :   case AMDGPU::ADJCALLSTACKUP:
    3343             :   case AMDGPU::ADJCALLSTACKDOWN: {
    3344         906 :     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    3345             :     MachineInstrBuilder MIB(*MF, &MI);
    3346             : 
    3347             :     // Add an implicit use of the frame offset reg to prevent the restore copy
    3348             :     // inserted after the call from being reordered after stack operations in
    3349             :     // the caller's frame.
    3350         906 :     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
    3351         906 :         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
    3352         906 :         .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
    3353             :     return BB;
    3354             :   }
    3355         487 :   case AMDGPU::SI_CALL_ISEL:
    3356             :   case AMDGPU::SI_TCRETURN_ISEL: {
    3357         487 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3358             :     const DebugLoc &DL = MI.getDebugLoc();
    3359         487 :     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    3360             : 
    3361         487 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3362         487 :     unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    3363         487 :     MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    3364             :     assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
    3365             : 
    3366         487 :     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
    3367             : 
    3368             :     MachineInstrBuilder MIB;
    3369         974 :     if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
    3370         906 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
    3371         453 :         .add(MI.getOperand(0))
    3372             :         .addGlobalAddress(G);
    3373             :     } else {
    3374          68 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
    3375          34 :         .add(MI.getOperand(0))
    3376             :         .addGlobalAddress(G);
    3377             : 
    3378             :       // There is an additional imm operand for tcreturn, but it should be in the
    3379             :       // right place already.
    3380             :     }
    3381             : 
    3382        3177 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
    3383        2690 :       MIB.add(MI.getOperand(I));
    3384             : 
    3385         487 :     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3386         487 :     MI.eraseFromParent();
    3387             :     return BB;
    3388             :   }
    3389           0 :   default:
    3390           0 :     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    3391             :   }
    3392             : }
    3393             : 
    3394       26390 : bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
    3395       52780 :   return isTypeLegal(VT.getScalarType());
    3396             : }
    3397             : 
    3398        4327 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
    3399             :   // This currently forces unfolding various combinations of fsub into fma with
    3400             :   // free fneg'd operands. As long as we have fast FMA (controlled by
    3401             :   // isFMAFasterThanFMulAndFAdd), we should perform these.
    3402             : 
    3403             :   // When fma is quarter rate, for f64 where add / sub are at best half rate,
    3404             :   // most of these combines appear to be cycle neutral but save on instruction
    3405             :   // count / code size.
    3406        4327 :   return true;
    3407             : }
    3408             : 
    3409       13297 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
    3410             :                                          EVT VT) const {
    3411       13297 :   if (!VT.isVector()) {
    3412       13222 :     return MVT::i1;
    3413             :   }
    3414         150 :   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
    3415             : }
    3416             : 
    3417      137011 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
    3418             :   // TODO: Should i16 be used always if legal? For now it would force VALU
    3419             :   // shifts.
    3420      137011 :   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
    3421             : }
    3422             : 
    3423             : // Answering this is somewhat tricky, and depends on the specific device, since
    3424             : // different devices have different rates for fma and for f64 operations in general.
    3425             : //
    3426             : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
    3427             : // regardless of which device (although the number of cycles differs between
    3428             : // devices), so it is always profitable for f64.
    3429             : //
    3430             : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
    3431             : // only on full rate devices. Normally, we should prefer selecting v_mad_f32
    3432             : // which we can always do even without fused FP ops since it returns the same
    3433             : // result as the separate operations and since it is always full
    3434             : // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
    3435             : // however does not support denormals, so we do report fma as faster if we have
    3436             : // a fast fma device and require denormals.
    3437             : //
    3438       12141 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
    3439       12141 :   VT = VT.getScalarType();
    3440             : 
    3441       12141 :   switch (VT.getSimpleVT().SimpleTy) {
    3442        9374 :   case MVT::f32: {
    3443             :     // This is as fast on some subtargets. However, we always have full rate f32
    3444             :     // mad available, which returns the same result as the separate operations and
    3445             :     // which we should prefer over fma. We can't use mad if we want to support
    3446             :     // denormals, so only report fma as faster in that case.
    3447        9374 :     if (Subtarget->hasFP32Denormals())
    3448         591 :       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
    3449             : 
    3450             :     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
    3451        8783 :     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
    3452             :   }
    3453             :   case MVT::f64:
    3454             :     return true;
    3455        1739 :   case MVT::f16:
    3456        1739 :     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
    3457             :   default:
    3458             :     break;
    3459             :   }
    3460             : 
    3461           0 :   return false;
    3462             : }
    3463             : 
    3464             : //===----------------------------------------------------------------------===//
    3465             : // Custom DAG Lowering Operations
    3466             : //===----------------------------------------------------------------------===//
    3467             : 
    3468             : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
    3469             : // wider vector type is legal.
    3470           6 : SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
    3471             :                                              SelectionDAG &DAG) const {
    3472             :   unsigned Opc = Op.getOpcode();
    3473           6 :   EVT VT = Op.getValueType();
    3474             :   assert(VT == MVT::v4f16);
    3475             : 
    3476             :   SDValue Lo, Hi;
    3477          12 :   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
    3478             : 
    3479             :   SDLoc SL(Op);
    3480             :   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
    3481          12 :                              Op->getFlags());
    3482             :   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
    3483          12 :                              Op->getFlags());
    3484             : 
    3485          18 :   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
    3486             : }
    3487             : 
    3488             : // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
    3489             : // wider vector type is legal.
    3490          86 : SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
    3491             :                                               SelectionDAG &DAG) const {
    3492             :   unsigned Opc = Op.getOpcode();
    3493          86 :   EVT VT = Op.getValueType();
    3494             :   assert(VT == MVT::v4i16 || VT == MVT::v4f16);
    3495             : 
    3496             :   SDValue Lo0, Hi0;
    3497         172 :   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
    3498             :   SDValue Lo1, Hi1;
    3499         172 :   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
    3500             : 
    3501             :   SDLoc SL(Op);
    3502             : 
    3503             :   SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
    3504         172 :                              Op->getFlags());
    3505             :   SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
    3506         172 :                              Op->getFlags());
    3507             : 
    3508         258 :   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
    3509             : }
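// As an illustrative example (not from the source), a v4i16 add
//     add v4i16 %a, %b
// is rewritten by splitBinaryVectorOp roughly as
//     (%a0, %a1) = split %a            ; two v2i16 halves
//     (%b0, %b1) = split %b
//     %lo = add v2i16 %a0, %b0
//     %hi = add v2i16 %a1, %b1
//     concat_vectors v4i16 %lo, %hi
// so each half uses the legal v2i16 operation instead of being scalarized.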
    3510             : 
    3511      203425 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    3512      406850 :   switch (Op.getOpcode()) {
    3513       22881 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    3514        1625 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    3515       69106 :   case ISD::LOAD: {
    3516       69106 :     SDValue Result = LowerLOAD(Op, DAG);
    3517             :     assert((!Result.getNode() ||
    3518             :             Result.getNode()->getNumValues() == 2) &&
    3519             :            "Load should return a value and a chain");
    3520       69106 :     return Result;
    3521             :   }
    3522             : 
    3523          51 :   case ISD::FSIN:
    3524             :   case ISD::FCOS:
    3525          51 :     return LowerTrig(Op, DAG);
    3526         658 :   case ISD::SELECT: return LowerSELECT(Op, DAG);
    3527         237 :   case ISD::FDIV: return LowerFDIV(Op, DAG);
    3528         259 :   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
    3529       77919 :   case ISD::STORE: return LowerSTORE(Op, DAG);
    3530         883 :   case ISD::GlobalAddress: {
    3531         883 :     MachineFunction &MF = DAG.getMachineFunction();
    3532         883 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    3533         883 :     return LowerGlobalAddress(MFI, Op, DAG);
    3534             :   }
    3535       19341 :   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    3536        1106 :   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
    3537        2179 :   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
    3538          45 :   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
    3539         143 :   case ISD::INSERT_VECTOR_ELT:
    3540         143 :     return lowerINSERT_VECTOR_ELT(Op, DAG);
    3541        5217 :   case ISD::EXTRACT_VECTOR_ELT:
    3542        5217 :     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    3543        1168 :   case ISD::BUILD_VECTOR:
    3544        1168 :     return lowerBUILD_VECTOR(Op, DAG);
    3545         479 :   case ISD::FP_ROUND:
    3546         479 :     return lowerFP_ROUND(Op, DAG);
    3547          27 :   case ISD::TRAP:
    3548          27 :     return lowerTRAP(Op, DAG);
    3549           9 :   case ISD::DEBUGTRAP:
    3550           9 :     return lowerDEBUGTRAP(Op, DAG);
    3551           6 :   case ISD::FABS:
    3552             :   case ISD::FNEG:
    3553           6 :     return splitUnaryVectorOp(Op, DAG);
    3554          86 :   case ISD::SHL:
    3555             :   case ISD::SRA:
    3556             :   case ISD::SRL:
    3557             :   case ISD::ADD:
    3558             :   case ISD::SUB:
    3559             :   case ISD::MUL:
    3560             :   case ISD::SMIN:
    3561             :   case ISD::SMAX:
    3562             :   case ISD::UMIN:
    3563             :   case ISD::UMAX:
    3564             :   case ISD::FMINNUM:
    3565             :   case ISD::FMAXNUM:
    3566             :   case ISD::FADD:
    3567             :   case ISD::FMUL:
    3568          86 :     return splitBinaryVectorOp(Op, DAG);
    3569             :   }
    3570             :   return SDValue();
    3571             : }
    3572             : 
    3573          33 : static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
    3574             :                                        const SDLoc &DL,
    3575             :                                        SelectionDAG &DAG, bool Unpacked) {
    3576          33 :   if (!LoadVT.isVector())
    3577           8 :     return Result;
    3578             : 
    3579          25 :   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    3580             :     // Truncate to v2i16/v4i16.
    3581          11 :     EVT IntLoadVT = LoadVT.changeTypeToInteger();
    3582             : 
    3583             :     // Work around the legalizer not scalarizing the truncate after vector op
    3584             :     // legalization by not creating an intermediate vector trunc.
    3585             :     SmallVector<SDValue, 4> Elts;
    3586          11 :     DAG.ExtractVectorElements(Result, Elts);
    3587          79 :     for (SDValue &Elt : Elts)
    3588          34 :       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
    3589             : 
    3590          11 :     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
    3591             : 
    3592             :     // Bitcast to original type (v2f16/v4f16).
    3593          11 :     return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
    3594             :   }
    3595             : 
    3596             :   // Cast back to the original packed type.
    3597          14 :   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
    3598             : }
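// Illustrative example (not from the source) of the Unpacked path above on a
// subtarget with unpacked D16 VMEM: a d16 load whose result should be v2f16 is
// actually performed as v2i32, and the raw result is rewritten roughly as
//     %e0 = trunc i32 (extract_vector_elt %res, 0) to i16
//     %e1 = trunc i32 (extract_vector_elt %res, 1) to i16
//     %v2i16 = build_vector %e0, %e1
//     %v2f16 = bitcast %v2i16 to v2f16
// On packed subtargets only the final bitcast back to the original type is
// needed.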
    3599             : 
    3600          18 : SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
    3601             :                                               MemSDNode *M,
    3602             :                                               SelectionDAG &DAG,
    3603             :                                               bool IsIntrinsic) const {
    3604             :   SDLoc DL(M);
    3605             :   SmallVector<SDValue, 10> Ops;
    3606          18 :   Ops.reserve(M->getNumOperands());
    3607             : 
    3608          36 :   Ops.push_back(M->getOperand(0));
    3609          18 :   if (IsIntrinsic)
    3610           0 :     Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
    3611             : 
    3612             :   // Skip operand 1, as it is the intrinsic ID.
    3613         288 :   for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
    3614         252 :     Ops.push_back(M->getOperand(I));
    3615             : 
    3616          18 :   bool Unpacked = Subtarget->hasUnpackedD16VMem();
    3617          36 :   EVT LoadVT = M->getValueType(0);
    3618             : 
    3619          18 :   EVT EquivLoadVT = LoadVT;
    3620          24 :   if (Unpacked && LoadVT.isVector()) {
    3621           4 :     EquivLoadVT = LoadVT.isVector() ?
    3622           4 :       EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    3623           8 :                        LoadVT.getVectorNumElements()) : LoadVT;
    3624             :   }
    3625             : 
    3626             :   // Change from v4f16/v2f16 to EquivLoadVT.
    3627          18 :   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
    3628             : 
    3629             :   SDValue Load
    3630             :     = DAG.getMemIntrinsicNode(
    3631             :       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
    3632             :       VTList, Ops, M->getMemoryVT(),
    3633          36 :       M->getMemOperand());
    3634          18 :   if (!Unpacked) // Just adjusted the opcode.
    3635          12 :     return Load;
    3636             : 
    3637           6 :   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
    3638             : 
    3639          12 :   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
    3640             : }
    3641             : 
    3642         272 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
    3643             :                                           SmallVectorImpl<SDValue> &Results,
    3644             :                                           SelectionDAG &DAG) const {
    3645         544 :   switch (N->getOpcode()) {
    3646             :   case ISD::INSERT_VECTOR_ELT: {
    3647          67 :     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
    3648          23 :       Results.push_back(Res);
    3649          67 :     return;
    3650             :   }
    3651             :   case ISD::EXTRACT_VECTOR_ELT: {
    3652           0 :     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
    3653           0 :       Results.push_back(Res);
    3654           0 :     return;
    3655             :   }
    3656          84 :   case ISD::INTRINSIC_WO_CHAIN: {
    3657         252 :     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    3658             :     switch (IID) {
    3659          28 :     case Intrinsic::amdgcn_cvt_pkrtz: {
    3660          28 :       SDValue Src0 = N->getOperand(1);
    3661          28 :       SDValue Src1 = N->getOperand(2);
    3662             :       SDLoc SL(N);
    3663             :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
    3664          28 :                                 Src0, Src1);
    3665          56 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    3666             :       return;
    3667             :     }
    3668          56 :     case Intrinsic::amdgcn_cvt_pknorm_i16:
    3669             :     case Intrinsic::amdgcn_cvt_pknorm_u16:
    3670             :     case Intrinsic::amdgcn_cvt_pk_i16:
    3671             :     case Intrinsic::amdgcn_cvt_pk_u16: {
    3672          56 :       SDValue Src0 = N->getOperand(1);
    3673          56 :       SDValue Src1 = N->getOperand(2);
    3674             :       SDLoc SL(N);
    3675             :       unsigned Opcode;
    3676             : 
    3677          56 :       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
    3678             :         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    3679          38 :       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
    3680             :         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    3681          20 :       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
    3682             :         Opcode = AMDGPUISD::CVT_PK_I16_I32;
    3683             :       else
    3684             :         Opcode = AMDGPUISD::CVT_PK_U16_U32;
    3685             : 
    3686          56 :       SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
    3687         112 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
    3688             :       return;
    3689             :     }
    3690             :     }
    3691             :     break;
    3692             :   }
    3693             :   case ISD::INTRINSIC_W_CHAIN: {
    3694           0 :     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
    3695           0 :       Results.push_back(Res);
    3696           0 :       Results.push_back(Res.getValue(1));
    3697           0 :       return;
    3698             :     }
    3699             : 
    3700           0 :     break;
    3701             :   }
    3702             :   case ISD::SELECT: {
    3703             :     SDLoc SL(N);
    3704          52 :     EVT VT = N->getValueType(0);
    3705          26 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    3706          52 :     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    3707          52 :     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
    3708             : 
    3709          26 :     EVT SelectVT = NewVT;
    3710          26 :     if (NewVT.bitsLT(MVT::i32)) {
    3711           2 :       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
    3712           2 :       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
    3713             :       SelectVT = MVT::i32;
    3714             :     }
    3715             : 
    3716             :     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
    3717          52 :                                     N->getOperand(0), LHS, RHS);
    3718             : 
    3719           0 :     if (NewVT != SelectVT)
    3720           2 :       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    3721          52 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    3722             :     return;
    3723             :   }
    3724             :   case ISD::FNEG: {
    3725           6 :     if (N->getValueType(0) != MVT::v2f16)
    3726             :       break;
    3727             : 
    3728             :     SDLoc SL(N);
    3729          10 :     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    3730             : 
    3731             :     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
    3732             :                              BC,
    3733          10 :                              DAG.getConstant(0x80008000, SL, MVT::i32));
    3734          10 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    3735             :     return;
    3736             :   }
    3737             :   case ISD::FABS: {
    3738           8 :     if (N->getValueType(0) != MVT::v2f16)
    3739             :       break;
    3740             : 
    3741             :     SDLoc SL(N);
    3742          12 :     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
    3743             : 
    3744             :     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
    3745             :                              BC,
    3746          12 :                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    3747          12 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    3748             :     return;
    3749             :   }
    3750             :   default:
    3751             :     break;
    3752             :   }
    3753             : }
    3754             : 
    3755             : /// Helper function for LowerBRCOND
    3756             : static SDNode *findUser(SDValue Value, unsigned Opcode) {
    3757             : 
    3758             :   SDNode *Parent = Value.getNode();
    3759         742 :   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
    3760        1551 :        I != E; ++I) {
    3761             : 
    3762         809 :     if (I.getUse().get() != Value)
    3763             :       continue;
    3764             : 
    3765         739 :     if (I->getOpcode() == Opcode)
    3766             :       return *I;
    3767             :   }
    3768             :   return nullptr;
    3769             : }
    3770             : 
    3771        1625 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    3772        1625 :   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    3773        1398 :     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    3774             :     case Intrinsic::amdgcn_if:
    3775             :       return AMDGPUISD::IF;
    3776          51 :     case Intrinsic::amdgcn_else:
    3777          51 :       return AMDGPUISD::ELSE;
    3778          61 :     case Intrinsic::amdgcn_loop:
    3779          61 :       return AMDGPUISD::LOOP;
    3780           0 :     case Intrinsic::amdgcn_end_cf:
    3781           0 :       llvm_unreachable("should not occur");
    3782           2 :     default:
    3783           2 :       return 0;
    3784             :     }
    3785             :   }
    3786             : 
    3787             :   // break, if_break, else_break are all only used as inputs to loop, not
    3788             :   // directly as branch conditions.
    3789             :   return 0;
    3790             : }
    3791             : 
    3792           4 : void SITargetLowering::createDebuggerPrologueStackObjects(
    3793             :     MachineFunction &MF) const {
    3794             :   // Create stack objects that are used for emitting debugger prologue.
    3795             :   //
    3796             :   // Debugger prologue writes work group IDs and work item IDs to scratch memory
    3797             :   // at a fixed location in the following format:
    3798             :   //   offset 0:  work group ID x
    3799             :   //   offset 4:  work group ID y
    3800             :   //   offset 8:  work group ID z
    3801             :   //   offset 16: work item ID x
    3802             :   //   offset 20: work item ID y
    3803             :   //   offset 24: work item ID z
    3804           4 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3805             :   int ObjectIdx = 0;
    3806             : 
    3807             :   // For each dimension:
    3808          28 :   for (unsigned i = 0; i < 3; ++i) {
    3809             :     // Create fixed stack object for work group ID.
    3810          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
    3811             :     Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    3812             :     // Create fixed stack object for work item ID.
    3813          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
    3814             :     Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
    3815             :   }
    3816           4 : }
    3817             : 
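                     : // The next three predicates decide how LowerGlobalAddress materializes a
                     : // global's address: a simple fixup when constants are emitted into the text
                     : // section, a GOT-relative access when the definition may live outside the
                     : // module (not DSO-local), and a direct PC-relative relocation otherwise.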
    3818        1147 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
    3819        1147 :   const Triple &TT = getTargetMachine().getTargetTriple();
    3820        1044 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    3821        1250 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    3822        1250 :          AMDGPU::shouldEmitConstantsToTextSection(TT);
    3823             : }
    3824             : 
    3825         595 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
    3826         541 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    3827         469 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    3828         126 :           GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    3829         786 :          !shouldEmitFixup(GV) &&
    3830         660 :          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
    3831             : }
    3832             : 
    3833         498 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
    3834         498 :   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
    3835             : }
    3836             : 
    3837             : /// This transforms the control flow intrinsics to get the branch destination as
    3838             : /// the last parameter; it also switches the branch target with BR if the need arises.
    3839        1625 : SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
    3840             :                                       SelectionDAG &DAG) const {
    3841             :   SDLoc DL(BRCOND);
    3842             : 
    3843        1625 :   SDNode *Intr = BRCOND.getOperand(1).getNode();
    3844        1625 :   SDValue Target = BRCOND.getOperand(2);
    3845             :   SDNode *BR = nullptr;
    3846             :   SDNode *SetCC = nullptr;
    3847             : 
    3848        1625 :   if (Intr->getOpcode() == ISD::SETCC) {
    3849             :     // As long as we negate the condition everything is fine
    3850             :     SetCC = Intr;
    3851        1286 :     Intr = SetCC->getOperand(0).getNode();
    3852             : 
    3853             :   } else {
    3854             :     // Get the target from BR if we don't negate the condition
    3855             :     BR = findUser(BRCOND, ISD::BR);
    3856         339 :     Target = BR->getOperand(1);
    3857             :   }
    3858             : 
    3859             :   // FIXME: This changes the types of the intrinsics instead of introducing new
    3860             :   // nodes with the correct types.
    3861             :   // e.g. llvm.amdgcn.loop
    3862             : 
    3863             :   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
    3864             :   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
    3865             : 
    3866        1625 :   unsigned CFNode = isCFIntrinsic(Intr);
    3867        1625 :   if (CFNode == 0) {
    3868             :     // This is a uniform branch so we don't need to legalize.
    3869        1161 :     return BRCOND;
    3870             :   }
    3871             : 
    3872         928 :   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
    3873             :                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
    3874             : 
    3875             :   assert(!SetCC ||
    3876             :         (SetCC->getConstantOperandVal(1) == 1 &&
    3877             :          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
    3878             :                                                              ISD::SETNE));
    3879             : 
    3880             :   // operands of the new intrinsic call
    3881             :   SmallVector<SDValue, 4> Ops;
    3882         464 :   if (HaveChain)
    3883         464 :     Ops.push_back(BRCOND.getOperand(0));
    3884             : 
    3885         928 :   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
    3886         464 :   Ops.push_back(Target);
    3887             : 
    3888         928 :   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
    3889             : 
    3890             :   // build the new intrinsic call
    3891         464 :   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
    3892             : 
    3893         464 :   if (!HaveChain) {
    3894             :     SDValue Ops[] =  {
    3895             :       SDValue(Result, 0),
    3896             :       BRCOND.getOperand(0)
    3897           0 :     };
    3898             : 
    3899           0 :     Result = DAG.getMergeValues(Ops, DL).getNode();
    3900             :   }
    3901             : 
    3902         464 :   if (BR) {
    3903             :     // Give the branch instruction our target
    3904             :     SDValue Ops[] = {
    3905          89 :       BR->getOperand(0),
    3906             :       BRCOND.getOperand(2)
    3907         178 :     };
    3908         178 :     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    3909          89 :     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    3910             :     BR = NewBR.getNode();
    3911             :   }
    3912             : 
    3913         928 :   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
    3914             : 
    3915             :   // Copy the intrinsic results to registers
    3916        1331 :   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    3917             :     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    3918         403 :     if (!CopyToReg)
    3919           3 :       continue;
    3920             : 
    3921         400 :     Chain = DAG.getCopyToReg(
    3922             :       Chain, DL,
    3923         400 :       CopyToReg->getOperand(1),
    3924             :       SDValue(Result, i - 1),
    3925         800 :       SDValue());
    3926             : 
    3927         800 :     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
    3928             :   }
    3929             : 
    3930             :   // Remove the old intrinsic from the chain
    3931         928 :   DAG.ReplaceAllUsesOfValueWith(
    3932         464 :     SDValue(Intr, Intr->getNumValues() - 1),
    3933         464 :     Intr->getOperand(0));
    3934             : 
    3935         464 :   return Chain;
    3936             : }
    3937             : 
    3938        2369 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
    3939             :                                             SDValue Op,
    3940             :                                             const SDLoc &DL,
    3941             :                                             EVT VT) const {
    3942        7107 :   return Op.getValueType().bitsLE(VT) ?
    3943        2369 :       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
    3944        7107 :       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
    3945             : }
    3946             : 
    3947         479 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
    3948             :   assert(Op.getValueType() == MVT::f16 &&
    3949             :          "Do not know how to custom lower FP_ROUND for non-f16 type");
    3950             : 
    3951         479 :   SDValue Src = Op.getOperand(0);
    3952             :   EVT SrcVT = Src.getValueType();
    3953             :   if (SrcVT != MVT::f64)
    3954         469 :     return Op;
    3955             : 
    3956             :   SDLoc DL(Op);
    3957             : 
    3958          10 :   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
    3959          10 :   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    3960          10 :   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    3961             : }
    3962             : 
    3963          27 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
    3964             :   SDLoc SL(Op);
    3965          27 :   SDValue Chain = Op.getOperand(0);
    3966             : 
    3967          39 :   if (Subtarget->getTrapHandlerAbi() != SISubtarget::TrapHandlerAbiHsa ||
    3968          12 :       !Subtarget->isTrapHandlerEnabled())
    3969          21 :     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
    3970             : 
    3971           6 :   MachineFunction &MF = DAG.getMachineFunction();
    3972           6 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3973             :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    3974             :   assert(UserSGPR != AMDGPU::NoRegister);
    3975             :   SDValue QueuePtr = CreateLiveInRegister(
    3976          12 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
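                     :   // Copy the queue pointer into SGPR0/SGPR1 for the trap handler, and pass
                     :   // the register pair to the trap node so the copy is not dead.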
    3977           6 :   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
    3978             :   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
    3979           6 :                                    QueuePtr, SDValue());
    3980             :   SDValue Ops[] = {
    3981             :     ToReg,
    3982             :     DAG.getTargetConstant(SISubtarget::TrapIDLLVMTrap, SL, MVT::i16),
    3983             :     SGPR01,
    3984             :     ToReg.getValue(1)
    3985          12 :   };
    3986           6 :   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    3987             : }
    3988             : 
    3989           9 : SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
    3990             :   SDLoc SL(Op);
    3991           9 :   SDValue Chain = Op.getOperand(0);
    3992           9 :   MachineFunction &MF = DAG.getMachineFunction();
    3993             : 
    3994          13 :   if (Subtarget->getTrapHandlerAbi() != SISubtarget::TrapHandlerAbiHsa ||
    3995           4 :       !Subtarget->isTrapHandlerEnabled()) {
    3996             :     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
    3997             :                                      "debugtrap handler not supported",
    3998             :                                      Op.getDebugLoc(),
    3999          14 :                                      DS_Warning);
    4000           7 :     LLVMContext &Ctx = MF.getFunction().getContext();
    4001           7 :     Ctx.diagnose(NoTrap);
    4002           7 :     return Chain;
    4003             :   }
    4004             : 
    4005             :   SDValue Ops[] = {
    4006             :     Chain,
    4007             :     DAG.getTargetConstant(SISubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
    4008           4 :   };
    4009           2 :   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    4010             : }
    4011             : 
    4012          32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
    4013             :                                              SelectionDAG &DAG) const {
    4014             :   // FIXME: Use inline constants (src_{shared, private}_base) instead.
    4015          32 :   if (Subtarget->hasApertureRegs()) {
    4016          12 :     unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4017             :         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
    4018             :         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    4019             :     unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
    4020             :         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
    4021             :         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    4022          12 :     unsigned Encoding =
    4023             :         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
    4024          12 :         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
    4025             :         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
    4026             : 
    4027          24 :     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    4028             :     SDValue ApertureReg = SDValue(
    4029          24 :         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    4030          12 :     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    4031          12 :     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
    4032             :   }
    4033             : 
    4034          20 :   MachineFunction &MF = DAG.getMachineFunction();
    4035          20 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    4036             :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    4037             :   assert(UserSGPR != AMDGPU::NoRegister);
    4038             : 
    4039             :   SDValue QueuePtr = CreateLiveInRegister(
    4040          40 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    4041             : 
    4042             :   // Offset into amd_queue_t for group_segment_aperture_base_hi /
    4043             :   // private_segment_aperture_base_hi.
    4044          20 :   uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
    4045             : 
    4046          20 :   SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
    4047             : 
    4048             :   // TODO: Use custom target PseudoSourceValue.
    4049             :   // TODO: We should use the value from the IR intrinsic call, but it might not
    4050             :   // be available, and it is not clear how to get it.
    4051          20 :   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
    4052          20 :                                               AMDGPUASI.CONSTANT_ADDRESS));
    4053             : 
    4054             :   MachinePointerInfo PtrInfo(V, StructOffset);
    4055             :   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
    4056             :                      MinAlign(64, StructOffset),
    4057             :                      MachineMemOperand::MODereferenceable |
    4058          40 :                          MachineMemOperand::MOInvariant);
    4059             : }
    4060             : 
    4061          45 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
    4062             :                                              SelectionDAG &DAG) const {
    4063             :   SDLoc SL(Op);
    4064             :   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
    4065             : 
    4066          45 :   SDValue Src = ASC->getOperand(0);
    4067          45 :   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
    4068             : 
    4069             :   const AMDGPUTargetMachine &TM =
    4070          45 :     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
    4071             : 
    4072             :   // flat -> local/private
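                     :   // The 64-bit flat pointer is truncated to its low 32 bits, and the flat
                     :   // null pointer (0) is remapped to the destination segment's null value.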
    4073          45 :   if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4074          12 :     unsigned DestAS = ASC->getDestAddressSpace();
    4075             : 
    4076          17 :     if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
    4077           5 :         DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4078             :       unsigned NullVal = TM.getNullPointerValue(DestAS);
    4079          12 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4080          12 :       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
    4081          12 :       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
    4082             : 
    4083             :       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
    4084          12 :                          NonNull, Ptr, SegmentNullPtr);
    4085             :     }
    4086             :   }
    4087             : 
    4088             :   // local/private -> flat
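                     :   // The 32-bit segment pointer becomes the low half of the flat pointer and
                     :   // the segment's aperture base becomes the high half; the segment's null
                     :   // value is remapped to the flat null pointer (0).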
    4089          33 :   if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    4090             :     unsigned SrcAS = ASC->getSrcAddressSpace();
    4091             : 
    4092          54 :     if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
    4093          22 :         SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
    4094             :       unsigned NullVal = TM.getNullPointerValue(SrcAS);
    4095          32 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    4096             : 
    4097             :       SDValue NonNull
    4098          32 :         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
    4099             : 
    4100          32 :       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
    4101             :       SDValue CvtPtr
    4102          32 :         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
    4103             : 
    4104             :       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
    4105             :                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
    4106          64 :                          FlatNullPtr);
    4107             :     }
    4108             :   }
    4109             : 
    4110             :   // global <-> flat are no-ops and never emitted.
    4111             : 
    4112           1 :   const MachineFunction &MF = DAG.getMachineFunction();
    4113             :   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    4114           2 :     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
    4115           1 :   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
    4116             : 
    4117           2 :   return DAG.getUNDEF(ASC->getValueType(0));
    4118             : }
    4119             : 
    4120         210 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    4121             :                                                  SelectionDAG &DAG) const {
    4122         210 :   SDValue Vec = Op.getOperand(0);
    4123         210 :   SDValue InsVal = Op.getOperand(1);
    4124         210 :   SDValue Idx = Op.getOperand(2);
    4125         210 :   EVT VecVT = Vec.getValueType();
    4126         210 :   EVT EltVT = VecVT.getVectorElementType();
    4127         210 :   unsigned VecSize = VecVT.getSizeInBits();
    4128         210 :   unsigned EltSize = EltVT.getSizeInBits();
    4129             : 
    4130             : 
    4131             :   assert(VecSize <= 64);
    4132             : 
    4133         210 :   unsigned NumElts = VecVT.getVectorNumElements();
    4134             :   SDLoc SL(Op);
    4135             :   auto KIdx = dyn_cast<ConstantSDNode>(Idx);
    4136             : 
    4137         210 :   if (NumElts == 4 && EltSize == 16 && KIdx) {
    4138          20 :     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
    4139             : 
    4140             :     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
    4141          40 :                                  DAG.getConstant(0, SL, MVT::i32));
    4142             :     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
    4143          40 :                                  DAG.getConstant(1, SL, MVT::i32));
    4144             : 
    4145          20 :     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    4146          20 :     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
    4147             : 
    4148          40 :     unsigned Idx = KIdx->getZExtValue();
    4149             :     bool InsertLo = Idx < 2;
    4150             :     SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
    4151          20 :       InsertLo ? LoVec : HiVec,
    4152             :       DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
    4153          60 :       DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
    4154             : 
    4155          20 :     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
    4156             : 
    4157             :     SDValue Concat = InsertLo ?
    4158          27 :       DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
    4159          40 :       DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
    4160             : 
    4161          20 :     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
    4162             :   }
    4163             : 
    4164             :   if (isa<ConstantSDNode>(Idx))
    4165         160 :     return SDValue();
    4166             : 
    4167          30 :   MVT IntVT = MVT::getIntegerVT(VecSize);
    4168             : 
    4169             :   // Avoid stack access for dynamic indexing.
    4170          30 :   SDValue Val = InsVal;
    4171             :   if (InsVal.getValueType() == MVT::f16)
    4172           6 :       Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
    4173             : 
    4174             :   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
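                     :   // i.e. build a mask covering the bit range of the selected element, then
                     :   // merge bitwise: the new value's bits where the mask is set, the original
                     :   // vector's bits everywhere else.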
    4175          30 :   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
    4176             : 
    4177             :   assert(isPowerOf2_32(EltSize));
    4178          30 :   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
    4179             : 
    4180             :   // Convert vector index to bit-index.
    4181          30 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
    4182             : 
    4183          30 :   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
    4184             :   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
    4185             :                             DAG.getConstant(0xffff, SL, IntVT),
    4186          60 :                             ScaledIdx);
    4187             : 
    4188          30 :   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
    4189             :   SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
    4190          60 :                             DAG.getNOT(SL, BFM, IntVT), BCVec);
    4191             : 
    4192          30 :   SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
    4193          30 :   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
    4194             : }
    4195             : 
    4196        5217 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
    4197             :                                                   SelectionDAG &DAG) const {
    4198             :   SDLoc SL(Op);
    4199             : 
    4200        5217 :   EVT ResultVT = Op.getValueType();
    4201        5217 :   SDValue Vec = Op.getOperand(0);
    4202        5217 :   SDValue Idx = Op.getOperand(1);
    4203        5217 :   EVT VecVT = Vec.getValueType();
    4204        5217 :   unsigned VecSize = VecVT.getSizeInBits();
    4205        5217 :   EVT EltVT = VecVT.getVectorElementType();
    4206             :   assert(VecSize <= 64);
    4207             : 
    4208             :   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    4209             : 
    4210             :   // Make sure we do any optimizations that will make it easier to fold
    4211             :   // source modifiers before obscuring it with bit operations.
    4212             : 
    4213             :   // XXX - Why doesn't this get called when vector_shuffle is expanded?
    4214        5217 :   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    4215           7 :     return Combined;
    4216             : 
    4217        5210 :   unsigned EltSize = EltVT.getSizeInBits();
    4218             :   assert(isPowerOf2_32(EltSize));
    4219             : 
    4220        5210 :   MVT IntVT = MVT::getIntegerVT(VecSize);
    4221        5210 :   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
    4222             : 
    4223             :   // Convert vector index to bit-index (* EltSize)
    4224        5210 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
    4225             : 
    4226        5210 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
    4227        5210 :   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
    4228             : 
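                     :   // The selected element now sits in the low bits of Elt; truncate it (and
                     :   // bitcast for f16 results) or any-extend it to the result type.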
    4229             :   if (ResultVT == MVT::f16) {
    4230        1226 :     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    4231        1226 :     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    4232             :   }
    4233             : 
    4234        3984 :   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
    4235             : }
    4236             : 
    4237        1168 : SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
    4238             :                                             SelectionDAG &DAG) const {
    4239             :   SDLoc SL(Op);
    4240        1168 :   EVT VT = Op.getValueType();
    4241             : 
    4242             :   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    4243         628 :     EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
    4244             : 
    4245             :     // Turn into pair of packed build_vectors.
    4246             :     // TODO: Special case for constants that can be materialized with s_mov_b64.
    4247             :     SDValue Lo = DAG.getBuildVector(HalfVT, SL,
    4248         628 :                                     { Op.getOperand(0), Op.getOperand(1) });
    4249             :     SDValue Hi = DAG.getBuildVector(HalfVT, SL,
    4250         628 :                                     { Op.getOperand(2), Op.getOperand(3) });
    4251             : 
    4252         314 :     SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
    4253         314 :     SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
    4254             : 
    4255         628 :     SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
    4256         314 :     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
    4257             :   }
    4258             : 
    4259             :   assert(VT == MVT::v2f16 || VT == MVT::v2i16);
    4260             : 
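                     :   // Pack the two 16-bit elements into a single i32: element 0 in bits [15:0]
                     :   // and element 1 in bits [31:16], then bitcast to the packed result type.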
    4261         854 :   SDValue Lo = Op.getOperand(0);
    4262         854 :   SDValue Hi = Op.getOperand(1);
    4263             : 
    4264         854 :   Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
    4265         854 :   Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
    4266             : 
    4267         854 :   Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
    4268         854 :   Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
    4269             : 
    4270             :   SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
    4271        1708 :                               DAG.getConstant(16, SL, MVT::i32));
    4272             : 
    4273         854 :   SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
    4274             : 
    4275         854 :   return DAG.getNode(ISD::BITCAST, SL, VT, Or);
    4276             : }
    4277             : 
    4278             : bool
    4279        1688 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    4280             :   // We can fold offsets for anything that doesn't require a GOT relocation.
    4281        3348 :   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    4282        3251 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
    4283        3376 :           GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
    4284        1785 :          !shouldEmitGOTReloc(GA->getGlobal());
    4285             : }
    4286             : 
    4287             : static SDValue
    4288         523 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
    4289             :                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
    4290             :                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
    4291             :   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
    4292             :   // lowered to the following code sequence:
    4293             :   //
    4294             :   // For constant address space:
    4295             :   //   s_getpc_b64 s[0:1]
    4296             :   //   s_add_u32 s0, s0, $symbol
    4297             :   //   s_addc_u32 s1, s1, 0
    4298             :   //
    4299             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4300             :   //   a fixup or relocation is emitted to replace $symbol with a literal
    4301             :   //   constant, which is a pc-relative offset from the encoding of the $symbol
    4302             :   //   operand to the global variable.
    4303             :   //
    4304             :   // For global address space:
    4305             :   //   s_getpc_b64 s[0:1]
    4306             :   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
    4307             :   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
    4308             :   //
    4309             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    4310             :   //   fixups or relocations are emitted to replace $symbol@*@lo and
    4311             :   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
    4312             :   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
    4313             :   //   operand to the global variable.
    4314             :   //
    4315             :   // What we want here is an offset from the value returned by s_getpc
    4316             :   // (which is the address of the s_add_u32 instruction) to the global
    4317             :   // variable, but since the encoding of $symbol starts 4 bytes after the start
    4318             :   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
    4319             :   // small. This requires us to add 4 to the global variable offset in order to
    4320             :   // compute the correct address.
    4321         523 :   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4322        1569 :                                              GAFlags);
    4323             :   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    4324             :                                              GAFlags == SIInstrInfo::MO_NONE ?
    4325        1046 :                                              GAFlags : GAFlags + 1);
    4326         523 :   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
    4327             : }
    4328             : 
    4329         883 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    4330             :                                              SDValue Op,
    4331             :                                              SelectionDAG &DAG) const {
    4332             :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
    4333         883 :   const GlobalValue *GV = GSD->getGlobal();
    4334             : 
    4335        1738 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
    4336        1710 :       GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
    4337        2567 :       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
    4338             :       // FIXME: It isn't correct to rely on the type of the pointer. This should
    4339             :       // be removed when address space 0 is 64-bit.
    4340         829 :       !GV->getType()->getElementType()->isFunctionTy())
    4341         360 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
    4342             : 
    4343             :   SDLoc DL(GSD);
    4344        1046 :   EVT PtrVT = Op.getValueType();
    4345             : 
    4346         523 :   if (shouldEmitFixup(GV))
    4347          25 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
    4348         498 :   else if (shouldEmitPCReloc(GV))
    4349         473 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
    4350         473 :                                    SIInstrInfo::MO_REL32);
    4351             : 
    4352             :   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
    4353          25 :                                             SIInstrInfo::MO_GOTPCREL32);
    4354             : 
    4355          25 :   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
    4356          25 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    4357          25 :   const DataLayout &DataLayout = DAG.getDataLayout();
    4358          25 :   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
    4359             :   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
    4360          25 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    4361             : 
    4362             :   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
    4363             :                      MachineMemOperand::MODereferenceable |
    4364          25 :                          MachineMemOperand::MOInvariant);
    4365             : }
    4366             : 
    4367        8925 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
    4368             :                                    const SDLoc &DL, SDValue V) const {
    4369             :   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
    4370             :   // the destination register.
    4371             :   //
    4372             :   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
    4373             :   // so we will end up with redundant moves to m0.
    4374             :   //
    4375             :   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
    4376             : 
    4377             :   // A Null SDValue creates a glue result.
    4378        8925 :   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
    4379        8925 :                                   V, Chain);
    4380        8925 :   return SDValue(M0, 0);
    4381             : }
    4382             : 
    4383          91 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
    4384             :                                                  SDValue Op,
    4385             :                                                  MVT VT,
    4386             :                                                  unsigned Offset) const {
    4387             :   SDLoc SL(Op);
    4388             :   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
    4389         182 :                                            DAG.getEntryNode(), Offset, 4, false);
    4390             :   // The local size values will have the hi 16-bits as zero.
    4391             :   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
    4392         273 :                      DAG.getValueType(VT));
    4393             : }
    4394             : 
    4395           2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4396             :                                         EVT VT) {
    4397           2 :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4398             :                                       "non-hsa intrinsic with hsa target",
    4399           6 :                                       DL.getDebugLoc());
    4400           2 :   DAG.getContext()->diagnose(BadIntrin);
    4401           2 :   return DAG.getUNDEF(VT);
    4402             : }
    4403             : 
    4404           5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    4405             :                                          EVT VT) {
    4406           5 :   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
    4407             :                                       "intrinsic not supported on subtarget",
    4408          15 :                                       DL.getDebugLoc());
    4409           5 :   DAG.getContext()->diagnose(BadIntrin);
    4410           5 :   return DAG.getUNDEF(VT);
    4411             : }
    4412             : 
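                     : // Pack the given 32-bit operands into one f32-based value of the next
                     : // supported width (1, 2, 4, 8 or 16 dwords), bitcasting non-f32 elements and
                     : // padding the tail with undef.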
    4413         652 : static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
    4414             :                                     ArrayRef<SDValue> Elts) {
    4415             :   assert(!Elts.empty());
    4416             :   MVT Type;
    4417             :   unsigned NumElts;
    4418             : 
    4419         652 :   if (Elts.size() == 1) {
    4420             :     Type = MVT::f32;
    4421             :     NumElts = 1;
    4422         435 :   } else if (Elts.size() == 2) {
    4423             :     Type = MVT::v2f32;
    4424             :     NumElts = 2;
    4425         310 :   } else if (Elts.size() <= 4) {
    4426             :     Type = MVT::v4f32;
    4427             :     NumElts = 4;
    4428          96 :   } else if (Elts.size() <= 8) {
    4429             :     Type = MVT::v8f32;
    4430             :     NumElts = 8;
    4431             :   } else {
    4432             :     assert(Elts.size() <= 16);
    4433             :     Type = MVT::v16f32;
    4434             :     NumElts = 16;
    4435             :   }
    4436             : 
    4437        1304 :   SmallVector<SDValue, 16> VecElts(NumElts);
    4438        4248 :   for (unsigned i = 0; i < Elts.size(); ++i) {
    4439        3596 :     SDValue Elt = Elts[i];
    4440             :     if (Elt.getValueType() != MVT::f32)
    4441         575 :       Elt = DAG.getBitcast(MVT::f32, Elt);
    4442        1798 :     VecElts[i] = Elt;
    4443             :   }
    4444        1009 :   for (unsigned i = Elts.size(); i < NumElts; ++i)
    4445         714 :     VecElts[i] = DAG.getUNDEF(MVT::f32);
    4446             : 
    4447         652 :   if (NumElts == 1)
    4448         217 :     return VecElts[0];
    4449         435 :   return DAG.getBuildVector(Type, DL, VecElts);
    4450             : }
    4451             : 
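                     : // Decode a constant cachepolicy operand: bit 0 is GLC and bit 1 is SLC.
                     : // Returns true only if the operand is a constant and no bits other than the
                     : // requested ones are set.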
    4452         652 : static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
    4453             :                              SDValue *GLC, SDValue *SLC) {
    4454             :   auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
    4455             :   if (!CachePolicyConst)
    4456             :     return false;
    4457             : 
    4458         652 :   uint64_t Value = CachePolicyConst->getZExtValue();
    4459             :   SDLoc DL(CachePolicy);
    4460         652 :   if (GLC) {
    4461        1220 :     *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
    4462         610 :     Value &= ~(uint64_t)0x1;
    4463             :   }
    4464         652 :   if (SLC) {
    4465        1304 :     *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
    4466         652 :     Value &= ~(uint64_t)0x2;
    4467             :   }
    4468             : 
    4469         652 :   return Value == 0;
    4470             : }
    4471             : 
    4472         659 : SDValue SITargetLowering::lowerImage(SDValue Op,
    4473             :                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
    4474             :                                      SelectionDAG &DAG) const {
    4475             :   SDLoc DL(Op);
    4476         659 :   MachineFunction &MF = DAG.getMachineFunction();
    4477         659 :   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    4478         659 :       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
    4479         659 :   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
    4480             : 
    4481         659 :   SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
    4482             :   bool IsD16 = false;
    4483         659 :   SDValue VData;
    4484             :   int NumVDataDwords;
    4485             :   unsigned AddrIdx; // Index of first address argument
    4486             :   unsigned DMask;
    4487             : 
    4488         659 :   if (BaseOpcode->Atomic) {
    4489          42 :     VData = Op.getOperand(2);
    4490             : 
    4491          42 :     bool Is64Bit = VData.getValueType() == MVT::i64;
    4492          42 :     if (BaseOpcode->AtomicX2) {
    4493           2 :       SDValue VData2 = Op.getOperand(3);
    4494           2 :       VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
    4495           6 :                                  {VData, VData2});
    4496           2 :       if (Is64Bit)
    4497           0 :         VData = DAG.getBitcast(MVT::v4i32, VData);
    4498             : 
    4499           2 :       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
    4500           2 :       DMask = Is64Bit ? 0xf : 0x3;
    4501           2 :       NumVDataDwords = Is64Bit ? 4 : 2;
    4502             :       AddrIdx = 4;
    4503             :     } else {
    4504          40 :       DMask = Is64Bit ? 0x3 : 0x1;
    4505          40 :       NumVDataDwords = Is64Bit ? 2 : 1;
    4506             :       AddrIdx = 3;
    4507             :     }
    4508             :   } else {
    4509             :     unsigned DMaskIdx;
    4510             : 
    4511         617 :     if (BaseOpcode->Store) {
    4512          87 :       VData = Op.getOperand(2);
    4513             : 
    4514          87 :       MVT StoreVT = VData.getSimpleValueType();
    4515          87 :       if (StoreVT.getScalarType() == MVT::f16) {
    4516          24 :         if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS ||
    4517          12 :             !BaseOpcode->HasD16)
    4518           0 :           return Op; // D16 is unsupported for this instruction
    4519             : 
    4520             :         IsD16 = true;
    4521          12 :         VData = handleD16VData(VData, DAG);
    4522             :       }
    4523             : 
    4524         174 :       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    4525             :       DMaskIdx = 3;
    4526             :     } else {
    4527         530 :       MVT LoadVT = Op.getSimpleValueType();
    4528         530 :       if (LoadVT.getScalarType() == MVT::f16) {
    4529          54 :         if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS ||
    4530          27 :             !BaseOpcode->HasD16)
    4531           0 :           return Op; // D16 is unsupported for this instruction
    4532             : 
    4533             :         IsD16 = true;
    4534          48 :         if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
    4535          14 :           ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
    4536             :       }
    4537             : 
    4538         530 :       NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
    4539         530 :       DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
    4540             :     }
    4541             : 
    4542             :     auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
    4543             :     if (!DMaskConst)
    4544           0 :       return Op;
    4545             : 
    4546         617 :     AddrIdx = DMaskIdx + 1;
    4547        1234 :     DMask = DMaskConst->getZExtValue();
    4548         617 :     if (!DMask && !BaseOpcode->Store) {
    4549             :       // Eliminate no-op loads. Stores with dmask == 0 are *not* no-ops: they
    4550             :       // store the channels' default values.
    4551           7 :       SDValue Undef = DAG.getUNDEF(Op.getValueType());
    4552           7 :       if (isa<MemSDNode>(Op))
    4553           6 :         return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
    4554           5 :       return Undef;
    4555             :     }
    4556             :   }
    4557             : 
    4558        1956 :   unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
    4559        1304 :                        (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
    4560        1304 :                        (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
    4561         652 :                        (BaseOpcode->LodOrClampOrMip ? 1 : 0);
    4562             :   SmallVector<SDValue, 4> VAddrs;
    4563        4248 :   for (unsigned i = 0; i < NumVAddrs; ++i)
    4564        3596 :     VAddrs.push_back(Op.getOperand(AddrIdx + i));
    4565        1304 :   SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
    4566             : 
    4567         652 :   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
    4568         652 :   SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
    4569             :   unsigned CtrlIdx; // Index of texfailctrl argument
    4570         652 :   SDValue Unorm;
    4571         652 :   if (!BaseOpcode->Sampler) {
    4572         239 :     Unorm = True;
    4573         239 :     CtrlIdx = AddrIdx + NumVAddrs + 1;
    4574             :   } else {
    4575             :     auto UnormConst =
    4576         413 :         dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
    4577             :     if (!UnormConst)
    4578           0 :       return Op;
    4579             : 
    4580         826 :     Unorm = UnormConst->getZExtValue() ? True : False;
    4581         413 :     CtrlIdx = AddrIdx + NumVAddrs + 3;
    4582             :   }
    4583             : 
    4584         652 :   SDValue TexFail = Op.getOperand(CtrlIdx);
    4585             :   auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
    4586        1304 :   if (!TexFailConst || TexFailConst->getZExtValue() != 0)
    4587           0 :     return Op;
    4588             : 
    4589         652 :   SDValue GLC;
    4590         652 :   SDValue SLC;
    4591         652 :   if (BaseOpcode->Atomic) {
    4592          42 :     GLC = True; // TODO no-return optimization
    4593          84 :     if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
    4594           0 :       return Op;
    4595             :   } else {
    4596        1220 :     if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
    4597           0 :       return Op;
    4598             :   }
    4599             : 
    4600             :   SmallVector<SDValue, 14> Ops;
    4601         652 :   if (BaseOpcode->Store || BaseOpcode->Atomic)
    4602         129 :     Ops.push_back(VData); // vdata
    4603         652 :   Ops.push_back(VAddr);
    4604        1304 :   Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
    4605         652 :   if (BaseOpcode->Sampler)
    4606         826 :     Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
    4607        1304 :   Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
    4608         652 :   Ops.push_back(Unorm);
    4609         652 :   Ops.push_back(GLC);
    4610         652 :   Ops.push_back(SLC);
    4611         652 :   Ops.push_back(False); // r128
    4612         652 :   Ops.push_back(False); // tfe
    4613         652 :   Ops.push_back(False); // lwe
    4614         652 :   Ops.push_back(DimInfo->DA ? True : False);
    4615         652 :   if (BaseOpcode->HasD16)
    4616         579 :     Ops.push_back(IsD16 ? True : False);
    4617         652 :   if (isa<MemSDNode>(Op))
    4618         621 :     Ops.push_back(Op.getOperand(0)); // chain
    4619             : 
    4620        1304 :   int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
    4621             :   int Opcode = -1;
    4622             : 
    4623         652 :   if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4624         370 :     Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
    4625             :                                    NumVDataDwords, NumVAddrDwords);
    4626         370 :   if (Opcode == -1)
    4627         631 :     Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
    4628             :                                    NumVDataDwords, NumVAddrDwords);
    4629             :   assert(Opcode != -1);
    4630             : 
    4631         652 :   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
    4632             :   if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    4633         621 :     MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
    4634         621 :     *MemRefs = MemOp->getMemOperand();
    4635         621 :     NewNode->setMemRefs(MemRefs, MemRefs + 1);
    4636             :   }
    4637             : 
    4638         652 :   if (BaseOpcode->AtomicX2) {
    4639             :     SmallVector<SDValue, 1> Elt;
    4640           2 :     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    4641           4 :     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
    4642         650 :   } else if (IsD16 && !BaseOpcode->Store) {
    4643             :     MVT LoadVT = Op.getSimpleValueType();
    4644             :     SDValue Adjusted = adjustLoadValueTypeImpl(
    4645          54 :         SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
    4646          54 :     return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
    4647             :   }
    4648             : 
    4649         623 :   return SDValue(NewNode, 0);
    4650             : }
    4651             : 
    4652       19341 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    4653             :                                                   SelectionDAG &DAG) const {
    4654       19341 :   MachineFunction &MF = DAG.getMachineFunction();
    4655       19341 :   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
    4656             : 
    4657       19341 :   EVT VT = Op.getValueType();
    4658             :   SDLoc DL(Op);
    4659       38682 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    4660             : 
    4661             :   // TODO: Should this propagate fast-math-flags?
    4662             : 
    4663       19341 :   switch (IntrinsicID) {
    4664           4 :   case Intrinsic::amdgcn_implicit_buffer_ptr: {
    4665           4 :     if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
    4666           2 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4667             :     return getPreloadedValue(DAG, *MFI, VT,
    4668           2 :                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
    4669             :   }
    4670          54 :   case Intrinsic::amdgcn_dispatch_ptr:
    4671             :   case Intrinsic::amdgcn_queue_ptr: {
    4672          54 :     if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
    4673             :       DiagnosticInfoUnsupported BadIntrin(
    4674             :           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    4675           4 :           DL.getDebugLoc());
    4676           2 :       DAG.getContext()->diagnose(BadIntrin);
    4677           2 :       return DAG.getUNDEF(VT);
    4678             :     }
    4679             : 
    4680          52 :     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
    4681             :       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    4682          52 :     return getPreloadedValue(DAG, *MFI, VT, RegID);
    4683             :   }
    4684          40 :   case Intrinsic::amdgcn_implicitarg_ptr: {
    4685          40 :     if (MFI->isEntryFunction())
    4686          32 :       return getImplicitArgPtr(DAG, DL);
    4687             :     return getPreloadedValue(DAG, *MFI, VT,
    4688           8 :                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    4689             :   }
    4690       12887 :   case Intrinsic::amdgcn_kernarg_segment_ptr: {
    4691             :     return getPreloadedValue(DAG, *MFI, VT,
    4692       12887 :                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    4693             :   }
    4694           9 :   case Intrinsic::amdgcn_dispatch_id: {
    4695           9 :     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
    4696             :   }
    4697             :   case Intrinsic::amdgcn_rcp:
    4698          20 :     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
    4699             :   case Intrinsic::amdgcn_rsq:
    4700          32 :     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4701           5 :   case Intrinsic::amdgcn_rsq_legacy:
    4702           5 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4703           1 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4704             : 
    4705           4 :     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    4706          11 :   case Intrinsic::amdgcn_rcp_legacy:
    4707          11 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4708           4 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    4709           7 :     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
    4710           6 :   case Intrinsic::amdgcn_rsq_clamp: {
    4711           6 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4712           3 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    4713             : 
    4714           3 :     Type *Type = VT.getTypeForEVT(*DAG.getContext());
    4715           3 :     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    4716           3 :     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
    4717             : 
    4718           3 :     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    4719             :     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
    4720           3 :                               DAG.getConstantFP(Max, DL, VT));
    4721             :     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
    4722           3 :                        DAG.getConstantFP(Min, DL, VT));
    4723             :   }
    4724           2 :   case Intrinsic::r600_read_ngroups_x:
    4725           4 :     if (Subtarget->isAmdHsaOS())
    4726           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4727             : 
    4728             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4729           2 :                                     SI::KernelInputOffsets::NGROUPS_X, 4, false);
    4730           2 :   case Intrinsic::r600_read_ngroups_y:
    4731           4 :     if (Subtarget->isAmdHsaOS())
    4732           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4733             : 
    4734             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4735           2 :                                     SI::KernelInputOffsets::NGROUPS_Y, 4, false);
    4736           2 :   case Intrinsic::r600_read_ngroups_z:
    4737           4 :     if (Subtarget->isAmdHsaOS())
    4738           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4739             : 
    4740             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4741           2 :                                     SI::KernelInputOffsets::NGROUPS_Z, 4, false);
    4742           2 :   case Intrinsic::r600_read_global_size_x:
    4743           4 :     if (Subtarget->isAmdHsaOS())
    4744           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4745             : 
    4746             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4747           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
    4748           2 :   case Intrinsic::r600_read_global_size_y:
    4749           4 :     if (Subtarget->isAmdHsaOS())
    4750           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4751             : 
    4752             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4753           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
    4754           2 :   case Intrinsic::r600_read_global_size_z:
    4755           4 :     if (Subtarget->isAmdHsaOS())
    4756           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4757             : 
    4758             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    4759           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
    4760          13 :   case Intrinsic::r600_read_local_size_x:
    4761          26 :     if (Subtarget->isAmdHsaOS())
    4762           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4763             : 
    4764             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4765          13 :                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
    4766          39 :   case Intrinsic::r600_read_local_size_y:
    4767          78 :     if (Subtarget->isAmdHsaOS())
    4768           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4769             : 
    4770             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4771          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
    4772          39 :   case Intrinsic::r600_read_local_size_z:
    4773          78 :     if (Subtarget->isAmdHsaOS())
    4774           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    4775             : 
    4776             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    4777          39 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
    4778          49 :   case Intrinsic::amdgcn_workgroup_id_x:
    4779             :   case Intrinsic::r600_read_tgid_x:
    4780             :     return getPreloadedValue(DAG, *MFI, VT,
    4781          49 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    4782          24 :   case Intrinsic::amdgcn_workgroup_id_y:
    4783             :   case Intrinsic::r600_read_tgid_y:
    4784             :     return getPreloadedValue(DAG, *MFI, VT,
    4785          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    4786          24 :   case Intrinsic::amdgcn_workgroup_id_z:
    4787             :   case Intrinsic::r600_read_tgid_z:
    4788             :     return getPreloadedValue(DAG, *MFI, VT,
    4789          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    4790        3014 :   case Intrinsic::amdgcn_workitem_id_x:
    4791             :   case Intrinsic::r600_read_tidig_x:
    4792             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4793        3014 :                           SDLoc(DAG.getEntryNode()),
    4794        6028 :                           MFI->getArgInfo().WorkItemIDX);
    4795             : 
    4796         125 :   case Intrinsic::amdgcn_workitem_id_y:
    4797             :   case Intrinsic::r600_read_tidig_y:
    4798             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4799         125 :                           SDLoc(DAG.getEntryNode()),
    4800         250 :                           MFI->getArgInfo().WorkItemIDY);
    4801          74 :   case Intrinsic::amdgcn_workitem_id_z:
    4802             :   case Intrinsic::r600_read_tidig_z:
    4803             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    4804          74 :                           SDLoc(DAG.getEntryNode()),
    4805         148 :                           MFI->getArgInfo().WorkItemIDZ);
    4806         448 :   case AMDGPUIntrinsic::SI_load_const: {
    4807             :     SDValue Ops[] = {
    4808             :       Op.getOperand(1),
    4809             :       Op.getOperand(2)
    4810         448 :     };
    4811             : 
    4812         896 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4813             :         MachinePointerInfo(),
    4814             :         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    4815             :             MachineMemOperand::MOInvariant,
    4816         448 :         VT.getStoreSize(), 4);
    4817             :     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
    4818         896 :                                    Op->getVTList(), Ops, VT, MMO);
    4819             :   }
    4820          33 :   case Intrinsic::amdgcn_fdiv_fast:
    4821          33 :     return lowerFDIV_FAST(Op, DAG);
    4822          84 :   case Intrinsic::amdgcn_interp_mov: {
    4823          84 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4824          84 :     SDValue Glue = M0.getValue(1);
    4825             :     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
    4826          84 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4827             :   }
    4828         211 :   case Intrinsic::amdgcn_interp_p1: {
    4829         211 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    4830         211 :     SDValue Glue = M0.getValue(1);
    4831             :     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
    4832         211 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    4833             :   }
    4834         195 :   case Intrinsic::amdgcn_interp_p2: {
    4835         195 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    4836         195 :     SDValue Glue = SDValue(M0.getNode(), 1);
    4837             :     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
    4838             :                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
    4839         195 :                        Glue);
    4840             :   }
    4841             :   case Intrinsic::amdgcn_sin:
    4842           5 :     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
    4843             : 
    4844             :   case Intrinsic::amdgcn_cos:
    4845           3 :     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
    4846             : 
    4847           3 :   case Intrinsic::amdgcn_log_clamp: {
    4848           3 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4849           2 :       return SDValue();
    4850             : 
    4851             :     DiagnosticInfoUnsupported BadIntrin(
    4852             :       MF.getFunction(), "intrinsic not supported on subtarget",
    4853           2 :       DL.getDebugLoc());
    4854           1 :     DAG.getContext()->diagnose(BadIntrin);
    4855           1 :     return DAG.getUNDEF(VT);
    4856             :   }
    4857             :   case Intrinsic::amdgcn_ldexp:
    4858             :     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
    4859           9 :                        Op.getOperand(1), Op.getOperand(2));
    4860             : 
    4861             :   case Intrinsic::amdgcn_fract:
    4862           7 :     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    4863             : 
    4864             :   case Intrinsic::amdgcn_class:
    4865             :     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
    4866          61 :                        Op.getOperand(1), Op.getOperand(2));
    4867          10 :   case Intrinsic::amdgcn_div_fmas:
    4868             :     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
    4869             :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    4870          10 :                        Op.getOperand(4));
    4871             : 
    4872             :   case Intrinsic::amdgcn_div_fixup:
    4873             :     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
    4874          13 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4875             : 
    4876             :   case Intrinsic::amdgcn_trig_preop:
    4877             :     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
    4878           4 :                        Op.getOperand(1), Op.getOperand(2));
    4879          27 :   case Intrinsic::amdgcn_div_scale: {
    4880             :     // 3rd parameter required to be a constant.
    4881             :     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4882             :     if (!Param)
    4883           9 :       return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
    4884             : 
    4885             :     // Translate to the operands expected by the machine instruction. The
    4886             :     // first operand must be a copy of either the numerator or the denominator.
    4887          24 :     SDValue Numerator = Op.getOperand(1);
    4888          24 :     SDValue Denominator = Op.getOperand(2);
    4889             : 
    4890             :     // Note this order is the opposite of the machine instruction's operand
    4891             :     // order: s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    4892             :     // intrinsic has the numerator as the first operand to match a normal
    4893             :     // division operation.
    4894             : 
    4895          48 :     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
    4896             : 
    4897             :     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
    4898          48 :                        Denominator, Numerator);
    4899             :   }
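// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// The selection above in isolation: the constant third operand of
// llvm.amdgcn.div.scale decides whether the numerator or the denominator is
// duplicated into the first (scaled) operand. Plain C++ illustration only;
// the hardware scaling itself is not modelled:
struct DivScaleOperands {
  float Src0; // operand that gets scaled (copy of Num or Den)
  float Src1; // denominator
  float Src2; // numerator
};

static DivScaleOperands selectDivScaleOperands(float Num, float Den,
                                               bool ParamIsAllOnes) {
  float Src0 = ParamIsAllOnes ? Num : Den;
  return {Src0, Den, Num};
}
// ----------------------------------------------------------------------------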
    4900          48 :   case Intrinsic::amdgcn_icmp: {
    4901             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4902             :     if (!CD)
    4903           6 :       return DAG.getUNDEF(VT);
    4904             : 
    4905          42 :     int CondCode = CD->getSExtValue();
    4906          42 :     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
    4907             :         CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    4908           2 :       return DAG.getUNDEF(VT);
    4909             : 
    4910             :     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
    4911          40 :     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
    4912             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4913          80 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4914             :   }
    4915          56 :   case Intrinsic::amdgcn_fcmp: {
    4916             :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4917             :     if (!CD)
    4918           2 :       return DAG.getUNDEF(VT);
    4919             : 
    4920          54 :     int CondCode = CD->getSExtValue();
    4921          54 :     if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
    4922             :         CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
    4923           2 :       return DAG.getUNDEF(VT);
    4924             : 
    4925             :     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
    4926          52 :     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
    4927             :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4928         104 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4929             :   }
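// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// amdgcn_icmp and amdgcn_fcmp share the same shape: a non-constant or
// out-of-range condition code lowers to undef, otherwise it is translated to
// a SETCC condition. A standalone sketch of the range check, using assumed
// illustrative predicate bounds instead of the real ICmpInst enumerators:
#include <optional>

constexpr int FIRST_ICMP_PRED = 32; // assumption: first integer predicate
constexpr int LAST_ICMP_PRED  = 41; // assumption: last integer predicate

static std::optional<int> validatedICmpPredicate(std::optional<int> Imm) {
  if (!Imm)                                            // not a constant -> undef
    return std::nullopt;
  if (*Imm < FIRST_ICMP_PRED || *Imm > LAST_ICMP_PRED)
    return std::nullopt;                               // out of range -> undef
  return Imm;                                          // safe to map to SETCC
}
// ----------------------------------------------------------------------------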
    4930             :   case Intrinsic::amdgcn_fmed3:
    4931             :     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
    4932          69 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4933             :   case Intrinsic::amdgcn_fmul_legacy:
    4934             :     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
    4935          31 :                        Op.getOperand(1), Op.getOperand(2));
    4936             :   case Intrinsic::amdgcn_sffbh:
    4937           4 :     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
    4938             :   case Intrinsic::amdgcn_sbfe:
    4939             :     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
    4940         102 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4941             :   case Intrinsic::amdgcn_ubfe:
    4942             :     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
    4943          94 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4944          95 :   case Intrinsic::amdgcn_cvt_pkrtz:
    4945             :   case Intrinsic::amdgcn_cvt_pknorm_i16:
    4946             :   case Intrinsic::amdgcn_cvt_pknorm_u16:
    4947             :   case Intrinsic::amdgcn_cvt_pk_i16:
    4948             :   case Intrinsic::amdgcn_cvt_pk_u16: {
    4949             :     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    4950          95 :     EVT VT = Op.getValueType();
    4951             :     unsigned Opcode;
    4952             : 
    4953          95 :     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
    4954             :       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    4955          56 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
    4956             :       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    4957          38 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
    4958             :       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    4959          20 :     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
    4960             :       Opcode = AMDGPUISD::CVT_PK_I16_I32;
    4961             :     else
    4962             :       Opcode = AMDGPUISD::CVT_PK_U16_U32;
    4963             : 
    4964             :     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
    4965          95 :                                Op.getOperand(1), Op.getOperand(2));
    4966          95 :     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
    4967             :   }
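// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// Every cvt_pk* variant above produces a single i32 holding two 16-bit lanes,
// then bitcasts it to the requested v2i16/v2f16 type. The packing step for
// the plain unsigned case, with the per-instruction rounding and saturation
// rules deliberately omitted:
#include <cstdint>

static uint32_t pack_u16_pair(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}
// ----------------------------------------------------------------------------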
    4968          13 :   case Intrinsic::amdgcn_wqm: {
    4969          13 :     SDValue Src = Op.getOperand(1);
    4970          13 :     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
    4971          13 :                    0);
    4972             :   }
    4973          16 :   case Intrinsic::amdgcn_wwm: {
    4974          16 :     SDValue Src = Op.getOperand(1);
    4975          16 :     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
    4976          16 :                    0);
    4977             :   }
    4978             :   case Intrinsic::amdgcn_fmad_ftz:
    4979             :     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
    4980         106 :                        Op.getOperand(2), Op.getOperand(3));
    4981        1113 :   default:
    4982        1113 :     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
    4983        1113 :             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
    4984          36 :       return lowerImage(Op, ImageDimIntr, DAG);
    4985             : 
    4986        1077 :     return Op;
    4987             :   }
    4988             : }
    4989             : 
    4990        1106 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
    4991             :                                                  SelectionDAG &DAG) const {
    4992        2212 :   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    4993             :   SDLoc DL(Op);
    4994             : 
    4995        1106 :   switch (IntrID) {
    4996             :   case Intrinsic::amdgcn_atomic_inc:
    4997             :   case Intrinsic::amdgcn_atomic_dec:
    4998             :   case Intrinsic::amdgcn_ds_fadd:
    4999             :   case Intrinsic::amdgcn_ds_fmin:
    5000             :   case Intrinsic::amdgcn_ds_fmax: {
    5001             :     MemSDNode *M = cast<MemSDNode>(Op);
    5002             :     unsigned Opc;
    5003         245 :     switch (IntrID) {
    5004             :     case Intrinsic::amdgcn_atomic_inc:
    5005             :       Opc = AMDGPUISD::ATOMIC_INC;
    5006             :       break;
    5007         115 :     case Intrinsic::amdgcn_atomic_dec:
    5008             :       Opc = AMDGPUISD::ATOMIC_DEC;
    5009         115 :       break;
    5010           6 :     case Intrinsic::amdgcn_ds_fadd:
    5011             :       Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
    5012           6 :       break;
    5013           6 :     case Intrinsic::amdgcn_ds_fmin:
    5014             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
    5015           6 :       break;
    5016           6 :     case Intrinsic::amdgcn_ds_fmax:
    5017             :       Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
    5018           6 :       break;
    5019           0 :     default:
    5020           0 :       llvm_unreachable("Unknown intrinsic!");
    5021             :     }
    5022             :     SDValue Ops[] = {
    5023         245 :       M->getOperand(0), // Chain
    5024             :       M->getOperand(2), // Ptr
    5025             :       M->getOperand(3)  // Value
    5026         245 :     };
    5027             : 
    5028         245 :     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
    5029         980 :                                    M->getMemoryVT(), M->getMemOperand());
    5030             :   }
    5031         190 :   case Intrinsic::amdgcn_buffer_load:
    5032             :   case Intrinsic::amdgcn_buffer_load_format: {
    5033             :     SDValue Ops[] = {
    5034             :       Op.getOperand(0), // Chain
    5035             :       Op.getOperand(2), // rsrc
    5036             :       Op.getOperand(3), // vindex
    5037             :       Op.getOperand(4), // offset
    5038             :       Op.getOperand(5), // glc
    5039             :       Op.getOperand(6)  // slc
    5040         190 :     };
    5041             : 
    5042         190 :     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
    5043             :         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
    5044         190 :     EVT VT = Op.getValueType();
    5045         190 :     EVT IntVT = VT.changeTypeToInteger();
    5046             :     auto *M = cast<MemSDNode>(Op);
    5047         190 :     EVT LoadVT = Op.getValueType();
    5048         380 :     bool IsD16 = LoadVT.getScalarType() == MVT::f16;
    5049             :     if (IsD16)
    5050           9 :       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
    5051             : 
    5052             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
    5053         543 :                                    M->getMemOperand());
    5054             :   }
    5055             :   case Intrinsic::amdgcn_tbuffer_load: {
    5056             :     MemSDNode *M = cast<MemSDNode>(Op);
    5057          37 :     EVT LoadVT = Op.getValueType();
    5058          74 :     bool IsD16 = LoadVT.getScalarType() == MVT::f16;
    5059             :     if (IsD16) {
    5060           9 :       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
    5061             :     }
    5062             : 
    5063             :     SDValue Ops[] = {
    5064             :       Op.getOperand(0),  // Chain
    5065             :       Op.getOperand(2),  // rsrc
    5066             :       Op.getOperand(3),  // vindex
    5067             :       Op.getOperand(4),  // voffset
    5068             :       Op.getOperand(5),  // soffset
    5069             :       Op.getOperand(6),  // offset
    5070             :       Op.getOperand(7),  // dfmt
    5071             :       Op.getOperand(8),  // nfmt
    5072             :       Op.getOperand(9),  // glc
    5073             :       Op.getOperand(10)   // slc
    5074          28 :     };
    5075             : 
    5076             :     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
    5077             :                                    Op->getVTList(), Ops, LoadVT,
    5078          84 :                                    M->getMemOperand());
    5079             :   }
    5080          34 :   case Intrinsic::amdgcn_buffer_atomic_swap:
    5081             :   case Intrinsic::amdgcn_buffer_atomic_add:
    5082             :   case Intrinsic::amdgcn_buffer_atomic_sub:
    5083             :   case Intrinsic::amdgcn_buffer_atomic_smin:
    5084             :   case Intrinsic::amdgcn_buffer_atomic_umin:
    5085             :   case Intrinsic::amdgcn_buffer_atomic_smax:
    5086             :   case Intrinsic::amdgcn_buffer_atomic_umax:
    5087             :   case Intrinsic::amdgcn_buffer_atomic_and:
    5088             :   case Intrinsic::amdgcn_buffer_atomic_or:
    5089             :   case Intrinsic::amdgcn_buffer_atomic_xor: {
    5090             :     SDValue Ops[] = {
    5091             :       Op.getOperand(0), // Chain
    5092             :       Op.getOperand(2), // vdata
    5093             :       Op.getOperand(3), // rsrc
    5094             :       Op.getOperand(4), // vindex
    5095             :       Op.getOperand(5), // offset
    5096             :       Op.getOperand(6)  // slc
    5097          34 :     };
    5098          34 :     EVT VT = Op.getValueType();
    5099             : 
    5100             :     auto *M = cast<MemSDNode>(Op);
    5101             :     unsigned Opcode = 0;
    5102             : 
    5103          34 :     switch (IntrID) {
    5104             :     case Intrinsic::amdgcn_buffer_atomic_swap:
    5105             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
    5106             :       break;
    5107           4 :     case Intrinsic::amdgcn_buffer_atomic_add:
    5108             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
    5109           4 :       break;
    5110           2 :     case Intrinsic::amdgcn_buffer_atomic_sub:
    5111             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
    5112           2 :       break;
    5113           2 :     case Intrinsic::amdgcn_buffer_atomic_smin:
    5114             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
    5115           2 :       break;
    5116           2 :     case Intrinsic::amdgcn_buffer_atomic_umin:
    5117             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
    5118           2 :       break;
    5119           2 :     case Intrinsic::amdgcn_buffer_atomic_smax:
    5120             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
    5121           2 :       break;
    5122           2 :     case Intrinsic::amdgcn_buffer_atomic_umax:
    5123             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
    5124           2 :       break;
    5125           2 :     case Intrinsic::amdgcn_buffer_atomic_and:
    5126             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
    5127           2 :       break;
    5128           2 :     case Intrinsic::amdgcn_buffer_atomic_or:
    5129             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
    5130           2 :       break;
    5131           2 :     case Intrinsic::amdgcn_buffer_atomic_xor:
    5132             :       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
    5133           2 :       break;
    5134           0 :     default:
    5135           0 :       llvm_unreachable("unhandled atomic opcode");
    5136             :     }
    5137             : 
    5138             :     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
    5139         102 :                                    M->getMemOperand());
    5140             :   }
    5141             : 
    5142          12 :   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    5143             :     SDValue Ops[] = {
    5144             :       Op.getOperand(0), // Chain
    5145             :       Op.getOperand(2), // src
    5146             :       Op.getOperand(3), // cmp
    5147             :       Op.getOperand(4), // rsrc
    5148             :       Op.getOperand(5), // vindex
    5149             :       Op.getOperand(6), // offset
    5150             :       Op.getOperand(7)  // slc
    5151          12 :     };
    5152          12 :     EVT VT = Op.getValueType();
    5153             :     auto *M = cast<MemSDNode>(Op);
    5154             : 
    5155             :     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
    5156          36 :                                    Op->getVTList(), Ops, VT, M->getMemOperand());
    5157             :   }
    5158             : 
    5159         588 :   default:
    5160         588 :     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
    5161         588 :             AMDGPU::getImageDimIntrinsicInfo(IntrID))
    5162         536 :       return lowerImage(Op, ImageDimIntr, DAG);
    5163             : 
    5164          52 :     return SDValue();
    5165             :   }
    5166             : }
    5167             : 
    5168          30 : SDValue SITargetLowering::handleD16VData(SDValue VData,
    5169             :                                          SelectionDAG &DAG) const {
    5170          30 :   EVT StoreVT = VData.getValueType();
    5171             : 
    5172             :   // No change for f16 and legal vector D16 types.
    5173          30 :   if (!StoreVT.isVector())
    5174           9 :     return VData;
    5175             : 
    5176             :   SDLoc DL(VData);
    5177             :   assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
    5178             : 
    5179          21 :   if (Subtarget->hasUnpackedD16VMem()) {
    5180             :     // We need to unpack the packed data to store.
    5181           7 :     EVT IntStoreVT = StoreVT.changeTypeToInteger();
    5182           7 :     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
    5183             : 
    5184           7 :     EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
    5185          14 :                                         StoreVT.getVectorNumElements());
    5186           7 :     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    5187           7 :     return DAG.UnrollVectorOp(ZExt.getNode());
    5188             :   }
    5189             : 
    5190             :   assert(isTypeLegal(StoreVT));
    5191          14 :   return VData;
    5192             : }
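// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// On subtargets with unpacked D16 VMem, handleD16VData() widens a packed
// <N x f16> store value so each element lives in its own 32-bit lane
// (the BITCAST + ZERO_EXTEND + UnrollVectorOp above). A bit-level sketch for
// the two-element case, treating the packed value as a raw u32:
#include <array>
#include <cstdint>

static std::array<uint32_t, 2> unpackD16(uint32_t PackedV2F16) {
  return {PackedV2F16 & 0xffffu,   // element 0, zero-extended to 32 bits
          PackedV2F16 >> 16};      // element 1, zero-extended to 32 bits
}
// ----------------------------------------------------------------------------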
    5193             : 
    5194        2179 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    5195             :                                               SelectionDAG &DAG) const {
    5196             :   SDLoc DL(Op);
    5197        2179 :   SDValue Chain = Op.getOperand(0);
    5198        4358 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    5199        2179 :   MachineFunction &MF = DAG.getMachineFunction();
    5200             : 
    5201        2179 :   switch (IntrinsicID) {
    5202         347 :   case Intrinsic::amdgcn_exp: {
    5203             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5204             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5205             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    5206             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
    5207             : 
    5208             :     const SDValue Ops[] = {
    5209             :       Chain,
    5210         347 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5211         347 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5212             :       Op.getOperand(4), // src0
    5213             :       Op.getOperand(5), // src1
    5214             :       Op.getOperand(6), // src2
    5215             :       Op.getOperand(7), // src3
    5216             :       DAG.getTargetConstant(0, DL, MVT::i1), // compr
    5217         347 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5218        2082 :     };
    5219             : 
    5220         694 :     unsigned Opc = Done->isNullValue() ?
    5221             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5222         694 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5223             :   }
    5224          96 :   case Intrinsic::amdgcn_exp_compr: {
    5225             :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    5226             :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    5227          96 :     SDValue Src0 = Op.getOperand(4);
    5228          96 :     SDValue Src1 = Op.getOperand(5);
    5229             :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    5230             :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
    5231             : 
    5232          96 :     SDValue Undef = DAG.getUNDEF(MVT::f32);
    5233             :     const SDValue Ops[] = {
    5234             :       Chain,
    5235          96 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    5236          96 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    5237          96 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
    5238          96 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    5239             :       Undef, // src2
    5240             :       Undef, // src3
    5241             :       DAG.getTargetConstant(1, DL, MVT::i1), // compr
    5242          96 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    5243         768 :     };
    5244             : 
    5245         192 :     unsigned Opc = Done->isNullValue() ?
    5246             :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    5247         192 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    5248             :   }
    5249          26 :   case Intrinsic::amdgcn_s_sendmsg:
    5250             :   case Intrinsic::amdgcn_s_sendmsghalt: {
    5251          26 :     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
    5252             :       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
    5253          26 :     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    5254          26 :     SDValue Glue = Chain.getValue(1);
    5255             :     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
    5256          26 :                        Op.getOperand(2), Glue);
    5257             :   }
    5258             :   case Intrinsic::amdgcn_init_exec: {
    5259             :     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
    5260           3 :                        Op.getOperand(2));
    5261             :   }
    5262             :   case Intrinsic::amdgcn_init_exec_from_input: {
    5263             :     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
    5264           4 :                        Op.getOperand(2), Op.getOperand(3));
    5265             :   }
    5266          31 :   case AMDGPUIntrinsic::AMDGPU_kill: {
    5267          31 :     SDValue Src = Op.getOperand(2);
    5268             :     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
    5269          22 :       if (!K->isNegative())
    5270           4 :         return Chain;
    5271             : 
    5272           7 :       SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
    5273           7 :       return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    5274             :     }
    5275             : 
    5276          20 :     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    5277          20 :     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
    5278             :   }
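// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// The constant fold above hinges on the kill condition being "value < 0": a
// constant with the sign bit clear makes the kill a no-op, anything else is
// canonicalised to -1.0 before emitting AMDGPUISD::KILL. The decision in
// isolation (hypothetical helper, mirroring ConstantFPSDNode::isNegative):
#include <cmath>

static bool killFoldsToNoOp(float KnownConstant) {
  return !std::signbit(KnownConstant); // sign bit clear -> lane stays alive
}
// ----------------------------------------------------------------------------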
    5279         141 :   case Intrinsic::amdgcn_s_barrier: {
    5280         141 :     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
    5281         133 :       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    5282         133 :       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
    5283         133 :       if (WGSize <= ST.getWavefrontSize())
    5284          10 :         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
    5285           5 :                                           Op.getOperand(0)), 0);
    5286             :     }
    5287         136 :     return SDValue();
    5288             :   }
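// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// The s_barrier case above drops the full barrier when the largest possible
// workgroup is known to fit in one wavefront: lanes of a single wave already
// run in lock-step, so only a scheduling-only WAVE_BARRIER is kept. The check
// as a standalone helper (names and the example width are illustrative):
static bool canUseWaveBarrier(unsigned MaxFlatWorkGroupSize,
                              unsigned WavefrontSize /* e.g. 64 */) {
  return MaxFlatWorkGroupSize <= WavefrontSize;
}
// ----------------------------------------------------------------------------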
    5289          14 :   case AMDGPUIntrinsic::SI_tbuffer_store: {
    5290             : 
    5291             :     // Extract vindex and voffset from vaddr as appropriate
    5292             :     const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
    5293             :     const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
    5294          14 :     SDValue VAddr = Op.getOperand(5);
    5295             : 
    5296          14 :     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
    5297             : 
    5298             :     assert(!(OffEn->isOne() && IdxEn->isOne()) &&
    5299             :            "Legacy intrinsic doesn't support both offset and index - use new version");
    5300             : 
    5301          28 :     SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
    5302          28 :     SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
    5303             : 
    5304             :     // Deal with the vec-3 case
    5305             :     const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
    5306          28 :     auto Opcode = NumChannels->getZExtValue() == 3 ?
    5307             :       AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
    5308             : 
    5309             :     SDValue Ops[] = {
    5310             :      Chain,
    5311             :      Op.getOperand(3),  // vdata
    5312             :      Op.getOperand(2),  // rsrc
    5313             :      VIndex,
    5314             :      VOffset,
    5315             :      Op.getOperand(6),  // soffset
    5316             :      Op.getOperand(7),  // inst_offset
    5317             :      Op.getOperand(8),  // dfmt
    5318             :      Op.getOperand(9),  // nfmt
    5319             :      Op.getOperand(12), // glc
    5320             :      Op.getOperand(13), // slc
    5321          14 :     };
    5322             : 
    5323             :     assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
    5324             :            "Value of tfe other than zero is unsupported");
    5325             : 
    5326          28 :     EVT VT = Op.getOperand(3).getValueType();
    5327          28 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    5328             :       MachinePointerInfo(),
    5329             :       MachineMemOperand::MOStore,
    5330          14 :       VT.getStoreSize(), 4);
    5331             :     return DAG.getMemIntrinsicNode(Opcode, DL,
    5332          28 :                                    Op->getVTList(), Ops, VT, MMO);
    5333             :   }
    5334             : 
    5335          41 :   case Intrinsic::amdgcn_tbuffer_store: {
    5336          41 :     SDValue VData = Op.getOperand(2);
    5337          82 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5338          41 :     if (IsD16)
    5339           9 :       VData = handleD16VData(VData, DAG);
    5340             :     SDValue Ops[] = {
    5341             :       Chain,
    5342             :       VData,             // vdata
    5343             :       Op.getOperand(3),  // rsrc
    5344             :       Op.getOperand(4),  // vindex
    5345             :       Op.getOperand(5),  // voffset
    5346             :       Op.getOperand(6),  // soffset
    5347             :       Op.getOperand(7),  // offset
    5348             :       Op.getOperand(8),  // dfmt
    5349             :       Op.getOperand(9),  // nfmt
    5350             :       Op.getOperand(10), // glc
    5351             :       Op.getOperand(11)  // slc
    5352          82 :     };
    5353          41 :     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
    5354             :                            AMDGPUISD::TBUFFER_STORE_FORMAT;
    5355             :     MemSDNode *M = cast<MemSDNode>(Op);
    5356             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5357         123 :                                    M->getMemoryVT(), M->getMemOperand());
    5358             :   }
    5359             : 
    5360         153 :   case Intrinsic::amdgcn_buffer_store:
    5361             :   case Intrinsic::amdgcn_buffer_store_format: {
    5362         153 :     SDValue VData = Op.getOperand(2);
    5363         306 :     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    5364         153 :     if (IsD16)
    5365           9 :       VData = handleD16VData(VData, DAG);
    5366             :     SDValue Ops[] = {
    5367             :       Chain,
    5368             :       VData,            // vdata
    5369             :       Op.getOperand(3), // rsrc
    5370             :       Op.getOperand(4), // vindex
    5371             :       Op.getOperand(5), // offset
    5372             :       Op.getOperand(6), // glc
    5373             :       Op.getOperand(7)  // slc
    5374         306 :     };
    5375         153 :     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
    5376             :                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    5377         153 :     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    5378             :     MemSDNode *M = cast<MemSDNode>(Op);
    5379             :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
    5380         459 :                                    M->getMemoryVT(), M->getMemOperand());
    5381             :   }
    5382        1323 :   default: {
    5383        1323 :     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
    5384        1323 :             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
    5385          87 :       return lowerImage(Op, ImageDimIntr, DAG);
    5386             : 
    5387        1236 :     return Op;
    5388             :   }
    5389             :   }
    5390             : }
    5391             : 
    5392          34 : static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
    5393             :                                  ISD::LoadExtType ExtType, SDValue Op,
    5394             :                                  const SDLoc &SL, EVT VT) {
    5395          34 :   if (VT.bitsLT(Op.getValueType()))
    5396           8 :     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
    5397             : 
    5398          26 :   switch (ExtType) {
    5399             :   case ISD::SEXTLOAD:
    5400           2 :     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
    5401             :   case ISD::ZEXTLOAD:
    5402          16 :     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
    5403             :   case ISD::EXTLOAD:
    5404           8 :     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
    5405           0 :   case ISD::NON_EXTLOAD:
    5406           0 :     return Op;
    5407             :   }
    5408             : 
    5409           0 :   llvm_unreachable("invalid ext type");
    5410             : }
    5411             : 
    5412      294158 : SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
    5413      294158 :   SelectionDAG &DAG = DCI.DAG;
    5414      857096 :   if (Ld->getAlignment() < 4 || Ld->isDivergent())
    5415       73254 :     return SDValue();
    5416             : 
    5417             :   // FIXME: Constant loads should all be marked invariant.
    5418             :   unsigned AS = Ld->getAddressSpace();
    5419      441808 :   if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
    5420      441808 :       AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
    5421       28155 :       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    5422       77584 :     return SDValue();
    5423             : 
    5424             :   // Don't do this early, since it may interfere with adjacent load merging for
    5425             :   // illegal types. We can avoid losing alignment information for exotic types
    5426             :   // pre-legalize.
    5427      143320 :   EVT MemVT = Ld->getMemoryVT();
    5428      180923 :   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
    5429       37603 :       MemVT.getSizeInBits() >= 32)
    5430      143286 :     return SDValue();
    5431             : 
    5432             :   SDLoc SL(Ld);
    5433             : 
    5434             :   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
    5435             :          "unexpected vector extload");
    5436             : 
    5437             :   // TODO: Drop only high part of range.
    5438          34 :   SDValue Ptr = Ld->getBasePtr();
    5439             :   SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
    5440             :                                 MVT::i32, SL, Ld->getChain(), Ptr,
    5441             :                                 Ld->getOffset(),
    5442          34 :                                 Ld->getPointerInfo(), MVT::i32,
    5443             :                                 Ld->getAlignment(),
    5444          34 :                                 Ld->getMemOperand()->getFlags(),
    5445          68 :                                 Ld->getAAInfo(),
    5446          68 :                                 nullptr); // Drop ranges
    5447             : 
    5448          34 :   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
    5449          34 :   if (MemVT.isFloatingPoint()) {
    5450             :     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
    5451             :            "unexpected fp extload");
    5452           0 :     TruncVT = MemVT.changeTypeToInteger();
    5453             :   }
    5454             : 
    5455          34 :   SDValue Cvt = NewLoad;
    5456          34 :   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    5457           2 :     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
    5458           4 :                       DAG.getValueType(TruncVT));
    5459          32 :   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
    5460             :              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    5461          24 :     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
    5462             :   } else {
    5463             :     assert(Ld->getExtensionType() == ISD::EXTLOAD);
    5464             :   }
    5465             : 
    5466          68 :   EVT VT = Ld->getValueType(0);
    5467          34 :   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
    5468             : 
    5469          34 :   DCI.AddToWorklist(Cvt.getNode());
    5470             : 
    5471             :   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
    5472             :   // the appropriate extension from the 32-bit load.
    5473          34 :   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
    5474          34 :   DCI.AddToWorklist(Cvt.getNode());
    5475             : 
    5476             :   // Handle conversion back to floating point if necessary.
    5477          34 :   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
    5478             : 
    5479          68 :   return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
    5480             : }
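// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// widenLoad() rewrites an aligned sub-dword load from constant/invariant
// memory into a full 32-bit load plus an in-register narrowing, so the scalar
// memory path can be used. A byte-level sketch of the equivalent transform,
// assuming little-endian layout and a 4-byte aligned pointer:
#include <cstdint>
#include <cstring>

static uint16_t widened_u16_load(const void *FourByteAlignedPtr) {
  uint32_t Wide;
  std::memcpy(&Wide, FourByteAlignedPtr, sizeof(Wide)); // one i32 load
  return static_cast<uint16_t>(Wide & 0xffffu);         // zext-in-reg + trunc
}
// ----------------------------------------------------------------------------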
    5481             : 
    5482       69106 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    5483             :   SDLoc DL(Op);
    5484             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    5485             :   ISD::LoadExtType ExtType = Load->getExtensionType();
    5486       69106 :   EVT MemVT = Load->getMemoryVT();
    5487             : 
    5488       69106 :   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    5489             :     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
    5490        2185 :       return SDValue();
    5491             : 
    5492             :     // FIXME: Copied from PPC
    5493             :     // First, load into 32 bits, then truncate back to the narrow memory type.
    5494             : 
    5495         117 :     SDValue Chain = Load->getChain();
    5496         117 :     SDValue BasePtr = Load->getBasePtr();
    5497         117 :     MachineMemOperand *MMO = Load->getMemOperand();
    5498             : 
    5499             :     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
    5500             : 
    5501             :     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
    5502         117 :                                    BasePtr, RealMemVT, MMO);
    5503             : 
    5504             :     SDValue Ops[] = {
    5505         117 :       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
    5506             :       NewLD.getValue(1)
    5507         234 :     };
    5508             : 
    5509         117 :     return DAG.getMergeValues(Ops, DL);
    5510             :   }
    5511             : 
    5512       66804 :   if (!MemVT.isVector())
    5513           0 :     return SDValue();
    5514             : 
    5515             :   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
    5516             :          "Custom lowering for non-i32 vectors hasn't been implemented.");
    5517             : 
    5518       66804 :   unsigned Alignment = Load->getAlignment();
    5519             :   unsigned AS = Load->getAddressSpace();
    5520      133608 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
    5521             :                           AS, Alignment)) {
    5522           2 :     SDValue Ops[2];
    5523           4 :     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    5524           2 :     return DAG.getMergeValues(Ops, DL);
    5525             :   }
    5526             : 
    5527       66802 :   MachineFunction &MF = DAG.getMachineFunction();
    5528       66802 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    5529             :   // If there is a possibility that flat instructions access scratch memory
    5530             :   // then we need to use the same legalization rules we use for private.
    5531       66802 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5532          27 :     AS = MFI->hasFlatScratchInit() ?
    5533             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5534             : 
    5535       66802 :   unsigned NumElements = MemVT.getVectorNumElements();
    5536             : 
    5537      133604 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5538       66802 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
    5539       43114 :     if (!Op->isDivergent() && Alignment >= 4)
    5540       42863 :       return SDValue();
    5541             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5542             :     // have the same legalization requirements as global and private
    5543             :     // loads.
    5544             :     //
    5545             :   }
    5546             : 
    5547       23939 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5548       23688 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5549             :       AS == AMDGPUASI.GLOBAL_ADDRESS) {
    5550       23125 :     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
    5551       15536 :         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
    5552             :         Alignment >= 4)
    5553         783 :       return SDValue();
    5554             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    5555             :     // have the same legalization requirements as global and private
    5556             :     // loads.
    5557             :     //
    5558             :   }
    5559       23156 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
    5560       22905 :       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
    5561       10661 :       AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5562       10661 :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5563       12495 :     if (NumElements > 4)
    5564        1215 :       return SplitVectorLoad(Op, DAG);
    5565             :     // v4 loads are supported for private and global memory.
    5566       11280 :     return SDValue();
    5567             :   }
    5568       10661 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    5569             :     // Depending on the setting of the private_element_size field in the
    5570             :     // resource descriptor, we can only make private accesses up to a certain
    5571             :     // size.
    5572         380 :     switch (Subtarget->getMaxPrivateElementSize()) {
    5573         217 :     case 4:
    5574         217 :       return scalarizeVectorLoad(Load, DAG);
    5575          53 :     case 8:
    5576          53 :       if (NumElements > 2)
    5577           5 :         return SplitVectorLoad(Op, DAG);
    5578          48 :       return SDValue();
    5579         110 :     case 16:
    5580             :       // Same as global/flat
    5581         110 :       if (NumElements > 4)
    5582           1 :         return SplitVectorLoad(Op, DAG);
    5583         109 :       return SDValue();
    5584           0 :     default:
    5585           0 :       llvm_unreachable("unsupported private_element_size");
    5586             :     }
    5587       10281 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    5588             :     // Use ds_read_b128 if possible.
    5589       14039 :     if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
    5590             :         MemVT.getStoreSize() == 16)
    5591         996 :       return SDValue();
    5592             : 
    5593        9285 :     if (NumElements > 2)
    5594        1156 :       return SplitVectorLoad(Op, DAG);
    5595             :   }
    5596        8129 :   return SDValue();
    5597             : }
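// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// For private-address vector loads, the branch above chooses between
// scalarising, splitting and keeping the load based on the subtarget's
// max_private_element_size. The same decision table as a standalone helper
// (the enum labels only name the three actions taken above):
enum class PrivateLoadAction { Scalarize, Split, Keep };

static PrivateLoadAction classifyPrivateLoad(unsigned MaxPrivateElementSize,
                                             unsigned NumElements) {
  switch (MaxPrivateElementSize) {
  case 4:
    return PrivateLoadAction::Scalarize;
  case 8:
    return NumElements > 2 ? PrivateLoadAction::Split : PrivateLoadAction::Keep;
  case 16:
    return NumElements > 4 ? PrivateLoadAction::Split : PrivateLoadAction::Keep;
  default:
    return PrivateLoadAction::Split; // unreachable in the real code
  }
}
// ----------------------------------------------------------------------------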
    5598             : 
    5599         658 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    5600         658 :   EVT VT = Op.getValueType();
    5601             :   assert(VT.getSizeInBits() == 64);
    5602             : 
    5603             :   SDLoc DL(Op);
    5604         658 :   SDValue Cond = Op.getOperand(0);
    5605             : 
    5606         658 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    5607         658 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    5608             : 
    5609         658 :   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    5610         658 :   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
    5611             : 
    5612         658 :   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
    5613         658 :   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
    5614             : 
    5615         658 :   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
    5616             : 
    5617         658 :   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
    5618         658 :   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
    5619             : 
    5620         658 :   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
    5621             : 
    5622        1316 :   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
    5623        1316 :   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
    5624             : }
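// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// LowerSELECT() above splits a 64-bit select into two independent 32-bit
// selects on the low and high halves (via v2i32 bitcasts and extracts). The
// same transform on plain integers:
#include <cstdint>

static uint64_t select64_as_two_select32(bool Cond, uint64_t A, uint64_t B) {
  uint32_t Lo = Cond ? static_cast<uint32_t>(A) : static_cast<uint32_t>(B);
  uint32_t Hi = Cond ? static_cast<uint32_t>(A >> 32)
                     : static_cast<uint32_t>(B >> 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
// ----------------------------------------------------------------------------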
    5625             : 
    5626             : // Catch division cases where we can use shortcuts with rcp and rsq
    5627             : // instructions.
    5628         176 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
    5629             :                                               SelectionDAG &DAG) const {
    5630             :   SDLoc SL(Op);
    5631         176 :   SDValue LHS = Op.getOperand(0);
    5632         176 :   SDValue RHS = Op.getOperand(1);
    5633         176 :   EVT VT = Op.getValueType();
    5634         176 :   const SDNodeFlags Flags = Op->getFlags();
    5635         176 :   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
    5636             : 
    5637         139 :   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    5638          16 :     return SDValue();
    5639             : 
    5640             :   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    5641         100 :     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    5642         200 :       if (CLHS->isExactlyValue(1.0)) {
    5643             :         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    5644             :         // the CI documentation have a worst case error of 1 ulp.
    5645             :         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    5646             :         // use it as long as we aren't trying to use denormals.
    5647             :         //
    5648             :         // v_rcp_f16 and v_rsq_f16 DO support denormals.
    5649             : 
    5650             :         // 1.0 / sqrt(x) -> rsq(x)
    5651             : 
    5652             :         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
    5653             :         // error seems really high at 2^29 ULP.
    5654          67 :         if (RHS.getOpcode() == ISD::FSQRT)
    5655           7 :           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
    5656             : 
    5657             :         // 1.0 / x -> rcp(x)
    5658          60 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5659             :       }
    5660             : 
    5661             :       // Same as for 1.0, but expand the sign out of the constant.
    5662          66 :       if (CLHS->isExactlyValue(-1.0)) {
    5663             :         // -1.0 / x -> rcp (fneg x)
    5664          33 :         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    5665          33 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    5666             :       }
    5667             :     }
    5668             :   }
    5669             : 
    5670          60 :   if (Unsafe) {
    5671             :     // Turn into multiply by the reciprocal.
    5672             :     // x / y -> x * (1.0 / y)
    5673          12 :     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    5674          12 :     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
    5675             :   }
    5676             : 
    5677          48 :   return SDValue();
    5678             : }
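// --- Editor's sketch (not part of SIISelLowering.cpp) ----------------------
// The fast-math shortcuts above, written as the scalar rewrites they perform.
// rcp()/rsq() here are exact stand-ins for the hardware reciprocal estimates,
// the unsafe-math/denormal gating is omitted, and "is the divisor a sqrt" is
// an explicit flag because scalar code cannot inspect a DAG node's opcode:
#include <cmath>

static float rcp(float X) { return 1.0f / X; }             // ~ AMDGPUISD::RCP
static float rsq(float X) { return 1.0f / std::sqrt(X); }  // ~ AMDGPUISD::RSQ

static float fast_fdiv(float LHS, float RHS, bool RHSIsSqrt) {
  if (LHS == 1.0f)
    return RHSIsSqrt ? rsq(RHS) : rcp(RHS); // 1.0/sqrt(x) -> rsq, 1.0/x -> rcp
  if (LHS == -1.0f)
    return rcp(-RHS);                       // -1.0 / x -> rcp(fneg x)
  return LHS * rcp(RHS);                    // unsafe: x / y -> x * (1.0 / y)
}
// ----------------------------------------------------------------------------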
    5679             : 
    5680          61 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5681             :                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
    5682          61 :   if (GlueChain->getNumValues() <= 1) {
    5683          16 :     return DAG.getNode(Opcode, SL, VT, A, B);
    5684             :   }
    5685             : 
    5686             :   assert(GlueChain->getNumValues() == 3);
    5687             : 
    5688          45 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5689          45 :   switch (Opcode) {
    5690           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5691          45 :   case ISD::FMUL:
    5692             :     Opcode = AMDGPUISD::FMUL_W_CHAIN;
    5693             :     break;
    5694             :   }
    5695             : 
    5696             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
    5697          45 :                      GlueChain.getValue(2));
    5698             : }
    5699             : 
    5700         305 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    5701             :                            EVT VT, SDValue A, SDValue B, SDValue C,
    5702             :                            SDValue GlueChain) {
    5703         305 :   if (GlueChain->getNumValues() <= 1) {
    5704          80 :     return DAG.getNode(Opcode, SL, VT, A, B, C);
    5705             :   }
    5706             : 
    5707             :   assert(GlueChain->getNumValues() == 3);
    5708             : 
    5709         225 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    5710         225 :   switch (Opcode) {
    5711           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    5712         225 :   case ISD::FMA:
    5713             :     Opcode = AMDGPUISD::FMA_W_CHAIN;
    5714             :     break;
    5715             :   }
    5716             : 
    5717             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
    5718         225 :                      GlueChain.getValue(2));
    5719             : }
    5720             : 
    5721          27 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
    5722          27 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5723          24 :     return FastLowered;
    5724             : 
    5725             :   SDLoc SL(Op);
    5726           3 :   SDValue Src0 = Op.getOperand(0);
    5727           3 :   SDValue Src1 = Op.getOperand(1);
    5728             : 
    5729           3 :   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    5730           3 :   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
    5731             : 
    5732           3 :   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
    5733           3 :   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
    5734             : 
    5735           3 :   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
    5736           3 :   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
    5737             : 
    5738           3 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
    5739             : }
    5740             : 
    5741             : // Faster 2.5 ULP division that does not support denormals.
    5742          33 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
    5743             :   SDLoc SL(Op);
    5744          33 :   SDValue LHS = Op.getOperand(1);
    5745          33 :   SDValue RHS = Op.getOperand(2);
    5746             : 
    5747          33 :   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
    5748             : 
    5749          33 :   const APFloat K0Val(BitsToFloat(0x6f800000));
    5750          33 :   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
    5751             : 
    5752          33 :   const APFloat K1Val(BitsToFloat(0x2f800000));
    5753          33 :   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
    5754             : 
    5755          33 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5756             : 
    5757             :   EVT SetCCVT =
    5758          66 :     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
    5759             : 
    5760          33 :   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
    5761             : 
    5762          33 :   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
    5763             : 
    5764             :   // TODO: Should this propagate fast-math-flags?
    5765          33 :   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
    5766             : 
    5767             :   // rcp does not support denormals.
    5768          33 :   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
    5769             : 
    5770          33 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
    5771             : 
    5772          66 :   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
    5773             : }
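                     : 
                     : // Note on the magic constants above (a sketch of the intent): 0x6f800000 is
                     : // the f32 bit pattern for 2^96 and 0x2f800000 is the pattern for 2^-32. When
                     : // |RHS| is very large, rcp(RHS) could land in the flushed denormal range, so
                     : // the denominator is pre-scaled by 2^-32 and the quotient is multiplied by
                     : // the same factor to compensate:
                     : //
                     : //   r3 = (|RHS| > 2^96) ? 2^-32 : 1.0
                     : //   result = r3 * (LHS * rcp(RHS * r3))  ~=  LHS / RHS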
    5774             : 
    5775         142 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
    5776         142 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    5777          81 :     return FastLowered;
    5778             : 
    5779             :   SDLoc SL(Op);
    5780          61 :   SDValue LHS = Op.getOperand(0);
    5781          61 :   SDValue RHS = Op.getOperand(1);
    5782             : 
    5783          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    5784             : 
    5785          61 :   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
    5786             : 
    5787             :   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5788          61 :                                           RHS, RHS, LHS);
    5789             :   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    5790          61 :                                         LHS, RHS, LHS);
    5791             : 
    5792             :   // Denominator is scaled to not be denormal, so using rcp is ok.
    5793             :   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
    5794          61 :                                   DenominatorScaled);
    5795             :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
    5796          61 :                                      DenominatorScaled);
    5797             : 
    5798             :   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
    5799             :                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
    5800             :                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
    5801             : 
    5802          61 :   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
    5803             : 
    5804          61 :   if (!Subtarget->hasFP32Denormals()) {
    5805          45 :     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    5806             :     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
    5807          45 :                                                       SL, MVT::i32);
    5808             :     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
    5809             :                                        DAG.getEntryNode(),
    5810          45 :                                        EnableDenormValue, BitField);
    5811             :     SDValue Ops[3] = {
    5812             :       NegDivScale0,
    5813             :       EnableDenorm.getValue(0),
    5814             :       EnableDenorm.getValue(1)
    5815          45 :     };
    5816             : 
    5817          45 :     NegDivScale0 = DAG.getMergeValues(Ops, SL);
    5818             :   }
    5819             : 
    5820             :   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
    5821          61 :                              ApproxRcp, One, NegDivScale0);
    5822             : 
    5823             :   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
    5824          61 :                              ApproxRcp, Fma0);
    5825             : 
    5826             :   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
    5827          61 :                            Fma1, Fma1);
    5828             : 
    5829             :   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
    5830          61 :                              NumeratorScaled, Mul);
    5831             : 
    5832          61 :   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
    5833             : 
    5834             :   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
    5835          61 :                              NumeratorScaled, Fma3);
    5836             : 
    5837          61 :   if (!Subtarget->hasFP32Denormals()) {
    5838             :     const SDValue DisableDenormValue =
    5839          45 :         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    5840             :     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
    5841             :                                         Fma4.getValue(1),
    5842             :                                         DisableDenormValue,
    5843             :                                         BitField,
    5844          45 :                                         Fma4.getValue(2));
    5845             : 
    5846             :     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    5847          45 :                                       DisableDenorm, DAG.getRoot());
    5848          45 :     DAG.setRoot(OutputChain);
    5849             :   }
    5850             : 
    5851          61 :   SDValue Scale = NumeratorScaled.getValue(1);
    5852             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
    5853          61 :                              Fma4, Fma1, Fma3, Scale);
    5854             : 
    5855          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
    5856             : }
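                     : 
                     : // A sketch of the refinement sequence above, in exact arithmetic, with
                     : // n = NumeratorScaled, d = DenominatorScaled and r = ApproxRcp ~= 1/d:
                     : //
                     : //   Fma0 = fma(-d, r, 1)        = 1 - d*r          (error of the estimate)
                     : //   Fma1 = fma(Fma0, r, r)      = r*(2 - d*r)      (refined reciprocal)
                     : //   Mul  = n * Fma1                                (initial quotient)
                     : //   Fma2 = fma(-d, Mul, n)      = n - d*Mul        (quotient residual)
                     : //   Fma3 = fma(Fma2, Fma1, Mul)                    (refined quotient)
                     : //   Fma4 = fma(-d, Fma3, n)     = n - d*Fma3       (final residual)
                     : //
                     : // DIV_FMAS then combines Fma4, Fma1 and Fma3 using the scale bit from
                     : // DIV_SCALE, and DIV_FIXUP restores the result for the original operands.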
    5857             : 
    5858          68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
    5859          68 :   if (DAG.getTarget().Options.UnsafeFPMath)
    5860           7 :     return lowerFastUnsafeFDIV(Op, DAG);
    5861             : 
    5862             :   SDLoc SL(Op);
    5863          61 :   SDValue X = Op.getOperand(0);
    5864          61 :   SDValue Y = Op.getOperand(1);
    5865             : 
    5866          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    5867             : 
    5868          61 :   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
    5869             : 
    5870          61 :   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
    5871             : 
    5872          61 :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
    5873             : 
    5874          61 :   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
    5875             : 
    5876          61 :   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
    5877             : 
    5878          61 :   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
    5879             : 
    5880          61 :   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
    5881             : 
    5882          61 :   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
    5883             : 
    5884          61 :   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
    5885          61 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
    5886             : 
    5887             :   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
    5888          61 :                              NegDivScale0, Mul, DivScale1);
    5889             : 
    5890          61 :   SDValue Scale;
    5891             : 
    5892          61 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    5893             :     // Work around a hardware bug on SI where the condition output from div_scale
    5894             :     // is not usable.
    5895             : 
    5896          23 :     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
    5897             : 
    5898             :     // Figure out which scale to use for div_fmas.
    5899          23 :     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    5900          23 :     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    5901          23 :     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    5902          23 :     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
    5903             : 
    5904          23 :     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    5905          23 :     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
    5906             : 
    5907             :     SDValue Scale0Hi
    5908          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    5909             :     SDValue Scale1Hi
    5910          23 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
    5911             : 
    5912          23 :     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    5913          23 :     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    5914          23 :     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
    5915             :   } else {
    5916          38 :     Scale = DivScale1.getValue(1);
    5917             :   }
    5918             : 
    5919             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
    5920          61 :                              Fma4, Fma3, Mul, Scale);
    5921             : 
    5922          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
    5923             : }
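                     : 
                     : // The SI-only block above recovers the div_scale "was this operand scaled"
                     : // information without using its i1 result (a sketch of the logic): each
                     : // DIV_SCALE result equals the operand it scales unless scaling actually
                     : // occurred, and any change of exponent shows up in the high 32 bits of the
                     : // f64, so the two comparisons are combined with xor:
                     : //
                     : //   Scale = (hi32(X) == hi32(DivScale1)) ^ (hi32(Y) == hi32(DivScale0))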
    5924             : 
    5925         237 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
    5926         237 :   EVT VT = Op.getValueType();
    5927             : 
    5928             :   if (VT == MVT::f32)
    5929         142 :     return LowerFDIV32(Op, DAG);
    5930             : 
    5931             :   if (VT == MVT::f64)
    5932          68 :     return LowerFDIV64(Op, DAG);
    5933             : 
    5934             :   if (VT == MVT::f16)
    5935          27 :     return LowerFDIV16(Op, DAG);
    5936             : 
    5937           0 :   llvm_unreachable("Unexpected type for fdiv");
    5938             : }
    5939             : 
    5940       77919 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    5941             :   SDLoc DL(Op);
    5942             :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    5943       77919 :   EVT VT = Store->getMemoryVT();
    5944             : 
    5945             :   if (VT == MVT::i1) {
    5946             :     return DAG.getTruncStore(Store->getChain(), DL,
    5947             :        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
    5948         558 :        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
    5949             :   }
    5950             : 
    5951             :   assert(VT.isVector() &&
    5952             :          Store->getValue().getValueType().getScalarType() == MVT::i32);
    5953             : 
    5954             :   unsigned AS = Store->getAddressSpace();
    5955      233199 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
    5956             :                           AS, Store->getAlignment())) {
    5957          26 :     return expandUnalignedStore(Store, DAG);
    5958             :   }
    5959             : 
    5960       77707 :   MachineFunction &MF = DAG.getMachineFunction();
    5961       77707 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    5962             :   // If there is a possibility that flat instructions access scratch memory
    5963             :   // then we need to use the same legalization rules we use for private.
    5964       77707 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    5965         261 :     AS = MFI->hasFlatScratchInit() ?
    5966             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    5967             : 
    5968       77707 :   unsigned NumElements = VT.getVectorNumElements();
    5969       77707 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
    5970             :       AS == AMDGPUASI.FLAT_ADDRESS) {
    5971       37958 :     if (NumElements > 4)
    5972        3834 :       return SplitVectorStore(Op, DAG);
    5973       34124 :     return SDValue();
    5974       39749 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    5975         545 :     switch (Subtarget->getMaxPrivateElementSize()) {
    5976         325 :     case 4:
    5977         325 :       return scalarizeVectorStore(Store, DAG);
    5978          86 :     case 8:
    5979          86 :       if (NumElements > 2)
    5980          10 :         return SplitVectorStore(Op, DAG);
    5981          76 :       return SDValue();
    5982         134 :     case 16:
    5983         134 :       if (NumElements > 4)
    5984           2 :         return SplitVectorStore(Op, DAG);
    5985         132 :       return SDValue();
    5986           0 :     default:
    5987           0 :       llvm_unreachable("unsupported private_element_size");
    5988             :     }
    5989       39204 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    5990             :     // Use ds_write_b128 if possible.
    5991       53562 :     if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
    5992             :         VT.getStoreSize() == 16)
    5993        4160 :       return SDValue();
    5994             : 
    5995       35044 :     if (NumElements > 2)
    5996        3740 :       return SplitVectorStore(Op, DAG);
    5997       31304 :     return SDValue();
    5998             :   } else {
    5999           0 :     llvm_unreachable("unhandled address space");
    6000             :   }
    6001             : }
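                     : 
                     : // A few illustrative cases of the rules above (a sketch, not exhaustive):
                     : //   - A v8i32 store to global memory has NumElements > 4, so it is split in
                     : //     half via SplitVectorStore (e.g. into two v4i32 pieces).
                     : //   - A v2i32 store to private memory with max-private-element-size 4 is
                     : //     scalarized into two i32 stores.
                     : //   - A 16-byte aligned v4i32 store to LDS on a subtarget with ds_write_b128
                     : //     available is left as-is and selected to a single 128-bit write.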
    6002             : 
    6003          51 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    6004             :   SDLoc DL(Op);
    6005          51 :   EVT VT = Op.getValueType();
    6006          51 :   SDValue Arg = Op.getOperand(0);
    6007             :   // TODO: Should this propagate fast-math-flags?
    6008             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    6009             :                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
    6010             :                                               DAG.getConstantFP(0.5/M_PI, DL,
    6011          51 :                                                                 VT)));
    6012             : 
    6013          51 :   switch (Op.getOpcode()) {
    6014             :   case ISD::FCOS:
    6015          48 :     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
    6016             :   case ISD::FSIN:
    6017          54 :     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
    6018           0 :   default:
    6019           0 :     llvm_unreachable("Wrong trig opcode");
    6020             :   }
    6021             : }
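                     : 
                     : // Sketch of the trig lowering above: as the 0.5/M_PI factor suggests, the
                     : // hardware sin/cos take their input as a fraction of a full period, so the
                     : // argument is first scaled by 1/(2*PI) and reduced to its fractional part.
                     : // For example, with x = 5*PI/2:
                     : //
                     : //   x * (0.5/PI) = 1.25,  fract(1.25) = 0.25,  SIN_HW(0.25) = sin(PI/2) = 1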
    6022             : 
    6023         259 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    6024             :   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
    6025             :   assert(AtomicNode->isCompareAndSwap());
    6026             :   unsigned AS = AtomicNode->getAddressSpace();
    6027             : 
    6028             :   // No custom lowering required for local address space
    6029         259 :   if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
    6030          64 :     return Op;
    6031             : 
    6032             :   // Non-local address space requires custom lowering for atomic compare
    6033             :   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
    6034             :   SDLoc DL(Op);
    6035         195 :   SDValue ChainIn = Op.getOperand(0);
    6036         195 :   SDValue Addr = Op.getOperand(1);
    6037         195 :   SDValue Old = Op.getOperand(2);
    6038         195 :   SDValue New = Op.getOperand(3);
    6039         195 :   EVT VT = Op.getValueType();
    6040         195 :   MVT SimpleVT = VT.getSimpleVT();
    6041         195 :   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
    6042             : 
    6043         390 :   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
    6044         195 :   SDValue Ops[] = { ChainIn, Addr, NewOld };
    6045             : 
    6046             :   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
    6047         585 :                                  Ops, VT, AtomicNode->getMemOperand());
    6048             : }
    6049             : 
    6050             : //===----------------------------------------------------------------------===//
    6051             : // Custom DAG optimizations
    6052             : //===----------------------------------------------------------------------===//
    6053             : 
    6054        1680 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
    6055             :                                                      DAGCombinerInfo &DCI) const {
    6056        3360 :   EVT VT = N->getValueType(0);
    6057        1680 :   EVT ScalarVT = VT.getScalarType();
    6058        1680 :   if (ScalarVT != MVT::f32)
    6059         199 :     return SDValue();
    6060             : 
    6061        1481 :   SelectionDAG &DAG = DCI.DAG;
    6062             :   SDLoc DL(N);
    6063             : 
    6064        1481 :   SDValue Src = N->getOperand(0);
    6065             :   EVT SrcVT = Src.getValueType();
    6066             : 
    6067             :   // TODO: We could try to match extracting the higher bytes, which would be
    6068             :   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
    6069             :   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
    6070             :   // about in practice.
    6071        1481 :   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    6072        1156 :     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
    6073         106 :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
    6074         106 :       DCI.AddToWorklist(Cvt.getNode());
    6075         106 :       return Cvt;
    6076             :     }
    6077             :   }
    6078             : 
    6079        1375 :   return SDValue();
    6080             : }
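                     : 
                     : // Example of the combine above (a sketch): if Src is, say, (and x, 0xff),
                     : // MaskedValueIsZero proves bits 8-31 are zero, so the conversion becomes
                     : //
                     : //   (f32 (uint_to_fp Src))  -->  (CVT_F32_UBYTE0 Src)
                     : //
                     : // which selects to v_cvt_f32_ubyte0, converting the low byte directly.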
    6081             : 
    6082             : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
    6083             : 
    6084             : // This is a variant of
    6085             : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
    6086             : //
    6087             : // The normal DAG combiner will do this, but only if the add has one use, since
    6088             : // otherwise it would increase the number of instructions.
    6089             : //
    6090             : // This prevents us from seeing a constant offset that can be folded into a
    6091             : // memory instruction's addressing mode. If we know the resulting add offset of
    6092             : // a pointer can be folded into an addressing offset, we can replace the pointer
    6093             : // operand with the add of the new constant offset. This eliminates one of the uses,
    6094             : // and may allow the remaining use to also be simplified.
    6095             : //
    6096         203 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    6097             :                                                unsigned AddrSpace,
    6098             :                                                EVT MemVT,
    6099             :                                                DAGCombinerInfo &DCI) const {
    6100         203 :   SDValue N0 = N->getOperand(0);
    6101         203 :   SDValue N1 = N->getOperand(1);
    6102             : 
    6103             :   // We only do this to handle cases where it's profitable when there are
    6104             :   // multiple uses of the add, so defer to the standard combine.
    6105         203 :   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
    6106             :       N0->hasOneUse())
    6107         157 :     return SDValue();
    6108             : 
    6109             :   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
    6110             :   if (!CN1)
    6111           0 :     return SDValue();
    6112             : 
    6113             :   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    6114             :   if (!CAdd)
    6115           2 :     return SDValue();
    6116             : 
    6117             :   // If the resulting offset is too large, we can't fold it into the addressing
    6118             :   // mode offset.
    6119          88 :   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
    6120          44 :   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
    6121             : 
    6122          44 :   AddrMode AM;
    6123          44 :   AM.HasBaseReg = true;
    6124          44 :   AM.BaseOffs = Offset.getSExtValue();
    6125          88 :   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
    6126          14 :     return SDValue();
    6127             : 
    6128          30 :   SelectionDAG &DAG = DCI.DAG;
    6129             :   SDLoc SL(N);
    6130          60 :   EVT VT = N->getValueType(0);
    6131             : 
    6132          30 :   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
    6133          30 :   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
    6134             : 
    6135             :   SDNodeFlags Flags;
    6136          30 :   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
    6137           0 :                           (N0.getOpcode() == ISD::OR ||
    6138           0 :                            N0->getFlags().hasNoUnsignedWrap()));
    6139             : 
    6140          30 :   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
    6141             : }
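                     : 
                     : // Worked example for the combine above (illustrative): with a multi-use add,
                     : //
                     : //   (shl (add x, 16), 2)  -->  (add (shl x, 2), 64)
                     : //
                     : // Offset = 16 << 2 = 64; the rewrite only happens when isLegalAddressingMode
                     : // confirms that an immediate offset of 64 can be folded into a memory
                     : // instruction addressing the given address space.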
    6142             : 
    6143      331174 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
    6144             :                                                   DAGCombinerInfo &DCI) const {
    6145      331174 :   SDValue Ptr = N->getBasePtr();
    6146      331174 :   SelectionDAG &DAG = DCI.DAG;
    6147             :   SDLoc SL(N);
    6148             : 
    6149             :   // TODO: We could also do this for multiplies.
    6150      331174 :   if (Ptr.getOpcode() == ISD::SHL) {
    6151             :     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(),  N->getAddressSpace(),
    6152         203 :                                           N->getMemoryVT(), DCI);
    6153         203 :     if (NewPtr) {
    6154          30 :       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
    6155             : 
    6156          60 :       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
    6157          30 :       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    6158             :     }
    6159             :   }
    6160             : 
    6161      331144 :   return SDValue();
    6162             : }
    6163             : 
    6164             : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
    6165        4952 :   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
    6166        4018 :          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
    6167        2019 :          (Opc == ISD::XOR && Val == 0);
    6168             : }
    6169             : 
    6170             : // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor. This
    6171             : // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
    6172             : // integer combine opportunities since most 64-bit operations are decomposed
    6173             : // this way. TODO: We won't want this for SALU, especially if it is an inline
    6174             : // immediate.
    6175        1999 : SDValue SITargetLowering::splitBinaryBitConstantOp(
    6176             :   DAGCombinerInfo &DCI,
    6177             :   const SDLoc &SL,
    6178             :   unsigned Opc, SDValue LHS,
    6179             :   const ConstantSDNode *CRHS) const {
    6180        1999 :   uint64_t Val = CRHS->getZExtValue();
    6181             :   uint32_t ValLo = Lo_32(Val);
    6182             :   uint32_t ValHi = Hi_32(Val);
    6183        1999 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6184             : 
    6185             :     if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
    6186             :          bitOpWithConstantIsReducible(Opc, ValHi)) ||
    6187         378 :         (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    6188             :     // If we need to materialize a 64-bit immediate, it will be split up later
    6189             :     // anyway. Avoid creating the harder to understand 64-bit immediate
    6190             :     // materialization.
    6191        1628 :     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
    6192             :   }
    6193             : 
    6194         371 :   return SDValue();
    6195             : }
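                     : 
                     : // Worked example (a sketch): for (and i64:x, 0x00000000FFFFFFFF), ValLo is
                     : // 0xffffffff and ValHi is 0, both reducible, so the operation becomes
                     : //
                     : //   lo = (and lo32(x), 0xffffffff)  -->  lo32(x)
                     : //   hi = (and hi32(x), 0)           -->  0
                     : //
                     : // reassembled into an i64. Non-reducible constants are still split when they
                     : // have a single use and are not inline immediates, since the 64-bit constant
                     : // would be materialized in two halves anyway.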
    6196             : 
    6197             : // Returns true if the argument is a boolean value which is not serialized into
    6198             : // memory or an argument, and does not require v_cndmask_b32 to be deserialized.
    6199             : static bool isBoolSGPR(SDValue V) {
    6200             :   if (V.getValueType() != MVT::i1)
    6201             :     return false;
    6202         240 :   switch (V.getOpcode()) {
    6203             :   default: break;
    6204             :   case ISD::SETCC:
    6205             :   case ISD::AND:
    6206             :   case ISD::OR:
    6207             :   case ISD::XOR:
    6208             :   case AMDGPUISD::FP_CLASS:
    6209             :     return true;
    6210             :   }
    6211             :   return false;
    6212             : }
    6213             : 
    6214             : // If a constant has all zeroes or all ones within each byte return it.
    6215             : // Otherwise return 0.
    6216         318 : static uint32_t getConstantPermuteMask(uint32_t C) {
    6217             :   // 0xff for any zero byte in the mask
    6218             :   uint32_t ZeroByteMask = 0;
    6219         318 :   if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
    6220         318 :   if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
    6221         318 :   if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
    6222         318 :   if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
    6223         318 :   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
    6224         318 :   if ((NonZeroByteMask & C) != NonZeroByteMask)
    6225             :     return 0; // Partial bytes selected.
    6226         306 :   return C;
    6227             : }
    6228             : 
    6229             : // Check if a node selects whole bytes from its operand 0 starting at a byte
    6230             : // boundary while masking the rest. Returns the select mask as used by v_perm_b32,
    6231             : // or ~0 if the node does not match.
    6232             : // Note byte select encoding:
    6233             : // value 0-3 selects corresponding source byte;
    6234             : // value 0xc selects zero;
    6235             : // value 0xff selects 0xff.
    6236        2360 : static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
    6237             :   assert(V.getValueSizeInBits() == 32);
    6238             : 
    6239        2360 :   if (V.getNumOperands() != 2)
    6240             :     return ~0;
    6241             : 
    6242             :   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
    6243             :   if (!N1)
    6244             :     return ~0;
    6245             : 
    6246        2256 :   uint32_t C = N1->getZExtValue();
    6247             : 
    6248        1128 :   switch (V.getOpcode()) {
    6249             :   default:
    6250             :     break;
    6251         311 :   case ISD::AND:
    6252         311 :     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
    6253         299 :       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    6254             :     }
    6255             :     break;
    6256             : 
    6257           2 :   case ISD::OR:
    6258           2 :     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
    6259           2 :       return (0x03020100 & ~ConstMask) | ConstMask;
    6260             :     }
    6261             :     break;
    6262             : 
    6263         722 :   case ISD::SHL:
    6264         722 :     if (C % 8)
    6265             :       return ~0;
    6266             : 
    6267         648 :     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    6268             : 
    6269          26 :   case ISD::SRL:
    6270          26 :     if (C % 8)
    6271             :       return ~0;
    6272             : 
    6273          26 :     return uint32_t(0x0c0c0c0c03020100ull >> C);
    6274             :   }
    6275             : 
    6276             :   return ~0;
    6277             : }
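                     : 
                     : // Worked examples of the encoding above (illustrative):
                     : //   - (and x, 0x00ff00ff): getConstantPermuteMask returns 0x00ff00ff, so the
                     : //     result is (0x03020100 & 0x00ff00ff) | (0x0c0c0c0c & 0xff00ff00)
                     : //     = 0x0c020c00, i.e. bytes 3 and 1 are zero, bytes 2 and 0 come from x.
                     : //   - (shl x, 16): returns (0x030201000c0c0c0c << 16) >> 32 = 0x01000c0c,
                     : //     i.e. the two low result bytes are zero and the two high result bytes
                     : //     are the two low bytes of x.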
    6278             : 
    6279       30067 : SDValue SITargetLowering::performAndCombine(SDNode *N,
    6280             :                                             DAGCombinerInfo &DCI) const {
    6281       30067 :   if (DCI.isBeforeLegalize())
    6282        1075 :     return SDValue();
    6283             : 
    6284       28992 :   SelectionDAG &DAG = DCI.DAG;
    6285       57984 :   EVT VT = N->getValueType(0);
    6286       28992 :   SDValue LHS = N->getOperand(0);
    6287       28992 :   SDValue RHS = N->getOperand(1);
    6288             : 
    6289             : 
    6290             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6291        1821 :   if (VT == MVT::i64 && CRHS) {
    6292        1609 :     if (SDValue Split
    6293        3218 :         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
    6294        1525 :       return Split;
    6295             :   }
    6296             : 
    6297       27467 :   if (CRHS && VT == MVT::i32) {
    6298             :     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    6299             :     // nb = number of trailing zeroes in mask
    6300             :     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    6301             :     // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
    6302       23537 :     uint64_t Mask = CRHS->getZExtValue();
    6303             :     unsigned Bits = countPopulation(Mask);
    6304       32608 :     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
    6305       26027 :         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
    6306          54 :       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
    6307         108 :         unsigned Shift = CShift->getZExtValue();
    6308         108 :         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
    6309          54 :         unsigned Offset = NB + Shift;
    6310          54 :         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
    6311             :           SDLoc SL(N);
    6312             :           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    6313          54 :                                     LHS->getOperand(0),
    6314             :                                     DAG.getConstant(Offset, SL, MVT::i32),
    6315         162 :                                     DAG.getConstant(Bits, SL, MVT::i32));
    6316          54 :           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
    6317             :           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
    6318          54 :                                     DAG.getValueType(NarrowVT));
    6319          54 :           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
    6320         162 :                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
    6321          54 :           return Shl;
    6322             :         }
    6323             :       }
    6324             :     }
    6325             : 
    6326             :     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    6327       35835 :     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
    6328             :         isa<ConstantSDNode>(LHS.getOperand(2))) {
    6329           2 :       uint32_t Sel = getConstantPermuteMask(Mask);
    6330           2 :       if (!Sel)
    6331           0 :         return SDValue();
    6332             : 
    6333             :       // Select 0xc for all zero bytes
    6334           2 :       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    6335             :       SDLoc DL(N);
    6336             :       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
    6337           4 :                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    6338             :     }
    6339             :   }
    6340             : 
    6341             :   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
    6342             :   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
    6343       27720 :   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    6344         309 :     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    6345         309 :     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
    6346             : 
    6347         309 :     SDValue X = LHS.getOperand(0);
    6348         309 :     SDValue Y = RHS.getOperand(0);
    6349         309 :     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
    6350         303 :       return SDValue();
    6351             : 
    6352           6 :     if (LCC == ISD::SETO) {
    6353             :       if (X != LHS.getOperand(1))
    6354           0 :         return SDValue();
    6355             : 
    6356           4 :       if (RCC == ISD::SETUNE) {
    6357             :         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
    6358           6 :         if (!C1 || !C1->isInfinity() || C1->isNegative())
    6359           0 :           return SDValue();
    6360             : 
    6361             :         const uint32_t Mask = SIInstrFlags::N_NORMAL |
    6362             :                               SIInstrFlags::N_SUBNORMAL |
    6363             :                               SIInstrFlags::N_ZERO |
    6364             :                               SIInstrFlags::P_ZERO |
    6365             :                               SIInstrFlags::P_SUBNORMAL |
    6366             :                               SIInstrFlags::P_NORMAL;
    6367             : 
    6368             :         static_assert(((~(SIInstrFlags::S_NAN |
    6369             :                           SIInstrFlags::Q_NAN |
    6370             :                           SIInstrFlags::N_INFINITY |
    6371             :                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
    6372             :                       "mask not equal");
    6373             : 
    6374             :         SDLoc DL(N);
    6375             :         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6376           4 :                            X, DAG.getConstant(Mask, DL, MVT::i32));
    6377             :       }
    6378             :     }
    6379             :   }
    6380             : 
    6381       23643 :   if (VT == MVT::i32 &&
    6382       23642 :       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    6383             :     // and x, (sext cc from i1) => select cc, x, 0
    6384          24 :     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
    6385             :       std::swap(LHS, RHS);
    6386          24 :     if (isBoolSGPR(RHS.getOperand(0)))
    6387          16 :       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
    6388          64 :                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
    6389             :   }
    6390             : 
    6391             :   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
    6392       27090 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6393       37696 :   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
    6394         498 :       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    6395         238 :     uint32_t LHSMask = getPermuteMask(DAG, LHS);
    6396         238 :     uint32_t RHSMask = getPermuteMask(DAG, RHS);
    6397         238 :     if (LHSMask != ~0u && RHSMask != ~0u) {
    6398             :       // Canonicalize the expression in an attempt to have fewer unique masks
    6399             :       // and therefore fewer registers used to hold the masks.
    6400           1 :       if (LHSMask > RHSMask) {
    6401             :         std::swap(LHSMask, RHSMask);
    6402             :         std::swap(LHS, RHS);
    6403             :       }
    6404             : 
    6405             :       // Select 0xc for each lane used from the source operand. Zero has the 0xc mask
    6406             :       // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
    6407           1 :       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6408           1 :       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6409             : 
    6410             :       // Check if we need to combine values from two sources within a byte.
    6411           2 :       if (!(LHSUsedLanes & RHSUsedLanes) &&
    6412             :           // If we select high and lower word keep it for SDWA.
    6413             :           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
    6414           1 :           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    6415             :         // Each byte in each mask is either a selector in the range 0-3, or has
    6416             :         // higher bits set: 0xff for 0xff, or 0x0c for zero. If 0x0c appears in
    6417             :         // either mask it must stay 0x0c. Otherwise the mask which is not 0xff
    6418             :         // wins. By ANDing both masks we get a correct result, except that 0x0c
    6419             :         // must then be restored where either input selected zero.
    6420           1 :         uint32_t Mask = LHSMask & RHSMask;
    6421           9 :         for (unsigned I = 0; I < 32; I += 8) {
    6422           4 :           uint32_t ByteSel = 0xff << I;
    6423           4 :           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
    6424           0 :             Mask &= (0x0c << I) & 0xffffffff;
    6425             :         }
    6426             : 
    6427             :         // Add 4 to each active LHS lane. It will not affect any existing 0xff
    6428             :         // or 0x0c.
    6429           1 :         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
    6430             :         SDLoc DL(N);
    6431             : 
    6432             :         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
    6433             :                            LHS.getOperand(0), RHS.getOperand(0),
    6434           2 :                            DAG.getConstant(Sel, DL, MVT::i32));
    6435             :       }
    6436             :     }
    6437             :   }
    6438             : 
    6439       27089 :   return SDValue();
    6440             : }
    6441             : 
    6442       15354 : SDValue SITargetLowering::performOrCombine(SDNode *N,
    6443             :                                            DAGCombinerInfo &DCI) const {
    6444       15354 :   SelectionDAG &DAG = DCI.DAG;
    6445       15354 :   SDValue LHS = N->getOperand(0);
    6446       15354 :   SDValue RHS = N->getOperand(1);
    6447             : 
    6448       15354 :   EVT VT = N->getValueType(0);
    6449             :   if (VT == MVT::i1) {
    6450             :     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    6451         139 :     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
    6452             :         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    6453          14 :       SDValue Src = LHS.getOperand(0);
    6454             :       if (Src != RHS.getOperand(0))
    6455           1 :         return SDValue();
    6456             : 
    6457             :       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    6458             :       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    6459          13 :       if (!CLHS || !CRHS)
    6460           0 :         return SDValue();
    6461             : 
    6462             :       // Only 10 bits are used.
    6463             :       static const uint32_t MaxMask = 0x3ff;
    6464             : 
    6465          39 :       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    6466             :       SDLoc DL(N);
    6467             :       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    6468          26 :                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
    6469             :     }
    6470             : 
    6471         111 :     return SDValue();
    6472             :   }
    6473             : 
    6474             :   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    6475        5885 :   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
    6476             :       LHS.getOpcode() == AMDGPUISD::PERM &&
    6477             :       isa<ConstantSDNode>(LHS.getOperand(2))) {
    6478           3 :     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    6479           3 :     if (!Sel)
    6480           0 :       return SDValue();
    6481             : 
    6482           3 :     Sel |= LHS.getConstantOperandVal(2);
    6483             :     SDLoc DL(N);
    6484             :     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
    6485           6 :                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
    6486             :   }
    6487             : 
    6488             :   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
    6489       15226 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6490       27996 :   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
    6491        1781 :       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
    6492         942 :     uint32_t LHSMask = getPermuteMask(DAG, LHS);
    6493         942 :     uint32_t RHSMask = getPermuteMask(DAG, RHS);
    6494         942 :     if (LHSMask != ~0u && RHSMask != ~0u) {
    6495             :       // Canonicalize the expression in an attempt to have fewer unique masks
    6496             :       // and therefore fewer registers used to hold the masks.
    6497         106 :       if (LHSMask > RHSMask) {
    6498             :         std::swap(LHSMask, RHSMask);
    6499             :         std::swap(LHS, RHS);
    6500             :       }
    6501             : 
    6502             :       // Select 0xc for each lane used from the source operand. Zero has the 0xc mask
    6503             :       // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
    6504         106 :       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6505         106 :       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
    6506             : 
    6507             :       // Check if we need to combine values from two sources within a byte.
    6508         212 :       if (!(LHSUsedLanes & RHSUsedLanes) &&
    6509             :           // If we select high and lower word keep it for SDWA.
    6510             :           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
    6511         106 :           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    6512             :         // Kill zero bytes selected by other mask. Zero value is 0xc.
    6513             :         LHSMask &= ~RHSUsedLanes;
    6514          12 :         RHSMask &= ~LHSUsedLanes;
    6515             :         // Add 4 to each active LHS lane
    6516          12 :         LHSMask |= LHSUsedLanes & 0x04040404;
    6517             :         // Combine masks
    6518          12 :         uint32_t Sel = LHSMask | RHSMask;
    6519             :         SDLoc DL(N);
    6520             : 
    6521             :         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
    6522             :                            LHS.getOperand(0), RHS.getOperand(0),
    6523          24 :                            DAG.getConstant(Sel, DL, MVT::i32));
    6524             :       }
    6525             :     }
    6526             :   }
    6527             : 
    6528             :   if (VT != MVT::i64)
    6529       13084 :     return SDValue();
    6530             : 
    6531             :   // TODO: This could be a generic combine with a predicate for extracting the
    6532             :   // high half of an integer being free.
    6533             : 
    6534             :   // (or i64:x, (zero_extend i32:y)) ->
    6535             :   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
    6536        3358 :   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
    6537             :       RHS.getOpcode() != ISD::ZERO_EXTEND)
    6538             :     std::swap(LHS, RHS);
    6539             : 
    6540        2130 :   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    6541        1255 :     SDValue ExtSrc = RHS.getOperand(0);
    6542             :     EVT SrcVT = ExtSrc.getValueType();
    6543             :     if (SrcVT == MVT::i32) {
    6544             :       SDLoc SL(N);
    6545             :       SDValue LowLHS, HiBits;
    6546        2510 :       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
    6547        1255 :       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
    6548             : 
    6549        1255 :       DCI.AddToWorklist(LowOr.getNode());
    6550        1255 :       DCI.AddToWorklist(HiBits.getNode());
    6551             : 
    6552             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    6553        1255 :                                 LowOr, HiBits);
    6554        1255 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    6555             :     }
    6556             :   }
    6557             : 
    6558         875 :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    6559             :   if (CRHS) {
    6560          81 :     if (SDValue Split
    6561         162 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
    6562          77 :       return Split;
    6563             :   }
    6564             : 
    6565         798 :   return SDValue();
    6566             : }
    6567             : 
    6568        1870 : SDValue SITargetLowering::performXorCombine(SDNode *N,
    6569             :                                             DAGCombinerInfo &DCI) const {
    6570        1870 :   EVT VT = N->getValueType(0);
    6571             :   if (VT != MVT::i64)
    6572        1273 :     return SDValue();
    6573             : 
    6574         597 :   SDValue LHS = N->getOperand(0);
    6575         597 :   SDValue RHS = N->getOperand(1);
    6576             : 
    6577             :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    6578             :   if (CRHS) {
    6579         309 :     if (SDValue Split
    6580         618 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
    6581          26 :       return Split;
    6582             :   }
    6583             : 
    6584         571 :   return SDValue();
    6585             : }
    6586             : 
    6587             : // Instructions that will be lowered with a final instruction that zeros the
    6588             : // high result bits.
    6589             : // XXX - probably only need to list legal operations.
    6590         248 : static bool fp16SrcZerosHighBits(unsigned Opc) {
    6591         248 :   switch (Opc) {
    6592             :   case ISD::FADD:
    6593             :   case ISD::FSUB:
    6594             :   case ISD::FMUL:
    6595             :   case ISD::FDIV:
    6596             :   case ISD::FREM:
    6597             :   case ISD::FMA:
    6598             :   case ISD::FMAD:
    6599             :   case ISD::FCANONICALIZE:
    6600             :   case ISD::FP_ROUND:
    6601             :   case ISD::UINT_TO_FP:
    6602             :   case ISD::SINT_TO_FP:
    6603             :   case ISD::FABS:
    6604             :     // Fabs is lowered to a bit operation, but it's an and which will clear the
    6605             :     // high bits anyway.
    6606             :   case ISD::FSQRT:
    6607             :   case ISD::FSIN:
    6608             :   case ISD::FCOS:
    6609             :   case ISD::FPOWI:
    6610             :   case ISD::FPOW:
    6611             :   case ISD::FLOG:
    6612             :   case ISD::FLOG2:
    6613             :   case ISD::FLOG10:
    6614             :   case ISD::FEXP:
    6615             :   case ISD::FEXP2:
    6616             :   case ISD::FCEIL:
    6617             :   case ISD::FTRUNC:
    6618             :   case ISD::FRINT:
    6619             :   case ISD::FNEARBYINT:
    6620             :   case ISD::FROUND:
    6621             :   case ISD::FFLOOR:
    6622             :   case ISD::FMINNUM:
    6623             :   case ISD::FMAXNUM:
    6624             :   case AMDGPUISD::FRACT:
    6625             :   case AMDGPUISD::CLAMP:
    6626             :   case AMDGPUISD::COS_HW:
    6627             :   case AMDGPUISD::SIN_HW:
    6628             :   case AMDGPUISD::FMIN3:
    6629             :   case AMDGPUISD::FMAX3:
    6630             :   case AMDGPUISD::FMED3:
    6631             :   case AMDGPUISD::FMAD_FTZ:
    6632             :   case AMDGPUISD::RCP:
    6633             :   case AMDGPUISD::RSQ:
    6634             :   case AMDGPUISD::RCP_IFLAG:
    6635             :   case AMDGPUISD::LDEXP:
    6636             :     return true;
    6637          35 :   default:
    6638             :     // fcopysign, select and others may be lowered to 32-bit bit operations
    6639             :     // which don't zero the high bits.
    6640          35 :     return false;
    6641             :   }
    6642             : }
    6643             : 
    6644       19383 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
    6645             :                                                    DAGCombinerInfo &DCI) const {
    6646       33820 :   if (!Subtarget->has16BitInsts() ||
    6647       14437 :       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6648       16054 :     return SDValue();
    6649             : 
    6650        6658 :   EVT VT = N->getValueType(0);
    6651             :   if (VT != MVT::i32)
    6652        1748 :     return SDValue();
    6653             : 
    6654        1581 :   SDValue Src = N->getOperand(0);
    6655             :   if (Src.getValueType() != MVT::i16)
    6656         218 :     return SDValue();
    6657             : 
    6658             :   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
    6659             :   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
    6660        1363 :   if (Src.getOpcode() == ISD::BITCAST) {
    6661         248 :     SDValue BCSrc = Src.getOperand(0);
    6662         248 :     if (BCSrc.getValueType() == MVT::f16 &&
    6663         248 :         fp16SrcZerosHighBits(BCSrc.getOpcode()))
    6664         639 :       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
    6665             :   }
    6666             : 
    6667        1150 :   return SDValue();
    6668             : }
    6669             : 
    6670          81 : SDValue SITargetLowering::performClassCombine(SDNode *N,
    6671             :                                               DAGCombinerInfo &DCI) const {
    6672          81 :   SelectionDAG &DAG = DCI.DAG;
    6673          81 :   SDValue Mask = N->getOperand(1);
    6674             : 
    6675             :   // fp_class x, 0 -> false
    6676             :   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    6677         126 :     if (CMask->isNullValue())
    6678           4 :       return DAG.getConstant(0, SDLoc(N), MVT::i1);
    6679             :   }
    6680             : 
    6681         158 :   if (N->getOperand(0).isUndef())
    6682           2 :     return DAG.getUNDEF(MVT::i1);
    6683             : 
    6684          77 :   return SDValue();
    6685             : }
    6686             : 
    6687         728 : SDValue SITargetLowering::performRcpCombine(SDNode *N,
    6688             :                                             DAGCombinerInfo &DCI) const {
    6689        1456 :   EVT VT = N->getValueType(0);
    6690         728 :   SDValue N0 = N->getOperand(0);
    6691             : 
    6692         728 :   if (N0.isUndef())
    6693           1 :     return N0;
    6694             : 
    6695         631 :   if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
    6696             :                          N0.getOpcode() == ISD::SINT_TO_FP)) {
    6697         410 :     return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
    6698         205 :                            N->getFlags());
    6699             :   }
    6700             : 
    6701         522 :   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
    6702             : }
    6703             : 
    6704             : static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
    6705          61 :   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
    6706             :     return true;
    6707             : 
    6708          29 :   return DAG.isKnownNeverNaN(Op);
    6709             : }
    6710             : 
    6711         452 : static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
    6712             :                             const SISubtarget *ST, unsigned MaxDepth=5) {
     6713             :   // If the source is the result of another standard FP operation, it is already
     6714             :   // in canonical form.
    6715             : 
    6716         452 :   switch (Op.getOpcode()) {
    6717             :   default:
    6718             :     break;
    6719             : 
    6720             :   // These will flush denorms if required.
    6721             :   case ISD::FADD:
    6722             :   case ISD::FSUB:
    6723             :   case ISD::FMUL:
    6724             :   case ISD::FSQRT:
    6725             :   case ISD::FCEIL:
    6726             :   case ISD::FFLOOR:
    6727             :   case ISD::FMA:
    6728             :   case ISD::FMAD:
    6729             : 
    6730             :   case ISD::FCANONICALIZE:
    6731             :     return true;
    6732             : 
    6733             :   case ISD::FP_ROUND:
    6734          36 :     return Op.getValueType().getScalarType() != MVT::f16 ||
    6735          16 :            ST->hasFP16Denormals();
    6736             : 
    6737             :   case ISD::FP_EXTEND:
    6738          20 :     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
    6739           4 :            ST->hasFP16Denormals();
    6740             : 
    6741           0 :   case ISD::FP16_TO_FP:
    6742             :   case ISD::FP_TO_FP16:
    6743           0 :     return ST->hasFP16Denormals();
    6744             : 
    6745             :   // It can/will be lowered or combined as a bit operation.
     6746             :   // We need to check their input recursively to handle this.
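                      :   // e.g. fneg/fabs only touch the sign bit, so they can neither quiet a
                      :   // signaling NaN nor flush a denormal in their operand.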
    6747          82 :   case ISD::FNEG:
    6748             :   case ISD::FABS:
    6749         164 :     return (MaxDepth > 0) &&
    6750         164 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
    6751             : 
    6752             :   case ISD::FSIN:
    6753             :   case ISD::FCOS:
    6754             :   case ISD::FSINCOS:
    6755          32 :     return Op.getValueType().getScalarType() != MVT::f16;
    6756             : 
    6757             :   // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
     6758             :   // For such targets we need to check their inputs recursively.
    6759          44 :   case ISD::FMINNUM:
    6760             :   case ISD::FMAXNUM:
    6761             :   case ISD::FMINNAN:
    6762             :   case ISD::FMAXNAN:
    6763             : 
    6764          22 :     if (ST->supportsMinMaxDenormModes() &&
    6765          66 :         DAG.isKnownNeverNaN(Op.getOperand(0)) &&
    6766           0 :         DAG.isKnownNeverNaN(Op.getOperand(1)))
    6767             :       return true;
    6768             : 
    6769          44 :     return (MaxDepth > 0) &&
    6770         100 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
    6771          12 :            isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
    6772             : 
    6773             :   case ISD::ConstantFP: {
    6774          12 :     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
    6775          24 :     return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
    6776             :   }
    6777             :   }
    6778             :   return false;
    6779             : }
    6780             : 
    6781             : // Constant fold canonicalize.
    6782         426 : SDValue SITargetLowering::performFCanonicalizeCombine(
    6783             :   SDNode *N,
    6784             :   DAGCombinerInfo &DCI) const {
    6785         426 :   SelectionDAG &DAG = DCI.DAG;
    6786         852 :   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
    6787             : 
    6788         426 :   if (!CFP) {
    6789         326 :     SDValue N0 = N->getOperand(0);
    6790         326 :     EVT VT = N0.getValueType().getScalarType();
    6791         326 :     auto ST = getSubtarget();
    6792             : 
    6793         172 :     if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
    6794          36 :          (VT == MVT::f64 && ST->hasFP64Denormals()) ||
    6795         292 :          (VT == MVT::f16 && ST->hasFP16Denormals())) &&
    6796         174 :         DAG.isKnownNeverNaN(N0))
    6797          10 :       return N0;
    6798             : 
    6799         316 :     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
    6800             : 
    6801         633 :     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
    6802         314 :         isCanonicalized(DAG, N0, ST))
    6803          94 :       return N0;
    6804             : 
    6805         222 :     return SDValue();
    6806             :   }
    6807             : 
    6808         100 :   const APFloat &C = CFP->getValueAPF();
    6809             : 
     6810             :   // Flush denormals to 0 if denormal handling is not enabled.
    6811         100 :   if (C.isDenormal()) {
    6812          48 :     EVT VT = N->getValueType(0);
    6813          24 :     EVT SVT = VT.getScalarType();
    6814           4 :     if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
    6815           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6816             : 
    6817           4 :     if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
    6818           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6819             : 
    6820          16 :     if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
    6821           0 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    6822             :   }
    6823             : 
    6824          96 :   if (C.isNaN()) {
    6825          84 :     EVT VT = N->getValueType(0);
    6826             :     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    6827          42 :     if (C.isSignaling()) {
    6828             :       // Quiet a signaling NaN.
    6829          44 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6830             :     }
    6831             : 
    6832             :     // Make sure it is the canonical NaN bitpattern.
    6833             :     //
    6834             :     // TODO: Can we use -1 as the canonical NaN value since it's an inline
    6835             :     // immediate?
    6836          60 :     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
    6837          28 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    6838             :   }
    6839             : 
    6840          60 :   return N->getOperand(0);
    6841             : }
    6842             : 
    6843             : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
    6844          59 :   switch (Opc) {
    6845             :   case ISD::FMAXNUM:
    6846             :     return AMDGPUISD::FMAX3;
    6847           5 :   case ISD::SMAX:
    6848             :     return AMDGPUISD::SMAX3;
    6849           5 :   case ISD::UMAX:
    6850             :     return AMDGPUISD::UMAX3;
    6851          12 :   case ISD::FMINNUM:
    6852             :     return AMDGPUISD::FMIN3;
    6853          15 :   case ISD::SMIN:
    6854             :     return AMDGPUISD::SMIN3;
    6855           8 :   case ISD::UMIN:
    6856             :     return AMDGPUISD::UMIN3;
    6857           0 :   default:
    6858           0 :     llvm_unreachable("Not a min/max opcode");
    6859             :   }
    6860             : }
    6861             : 
    6862         150 : SDValue SITargetLowering::performIntMed3ImmCombine(
    6863             :   SelectionDAG &DAG, const SDLoc &SL,
    6864             :   SDValue Op0, SDValue Op1, bool Signed) const {
    6865             :   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
    6866             :   if (!K1)
    6867          90 :     return SDValue();
    6868             : 
    6869             :   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    6870             :   if (!K0)
    6871           3 :     return SDValue();
    6872             : 
    6873          57 :   if (Signed) {
    6874         144 :     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
    6875           3 :       return SDValue();
    6876             :   } else {
    6877          27 :     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
    6878           3 :       return SDValue();
    6879             :   }
    6880             : 
    6881         102 :   EVT VT = K0->getValueType(0);
    6882          51 :   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
    6883           8 :   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    6884             :     return DAG.getNode(Med3Opc, SL, VT,
    6885          49 :                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
    6886             :   }
    6887             : 
    6888             :   // If there isn't a 16-bit med3 operation, convert to 32-bit.
    6889             :   MVT NVT = MVT::i32;
    6890           2 :   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    6891             : 
    6892           2 :   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
    6893           4 :   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
    6894           2 :   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
    6895             : 
    6896           2 :   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
    6897           2 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
    6898             : }
    6899             : 
    6900         778 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
    6901             :   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    6902             :     return C;
    6903             : 
    6904             :   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    6905          43 :     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
    6906             :       return C;
    6907             :   }
    6908             : 
    6909             :   return nullptr;
    6910             : }
    6911             : 
    6912         455 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
    6913             :                                                   const SDLoc &SL,
    6914             :                                                   SDValue Op0,
    6915             :                                                   SDValue Op1) const {
    6916         455 :   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
    6917         455 :   if (!K1)
    6918         132 :     return SDValue();
    6919             : 
    6920         323 :   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
    6921         323 :   if (!K0)
    6922           3 :     return SDValue();
    6923             : 
    6924             :   // Ordered >= (although NaN inputs should have folded away by now).
    6925         960 :   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
    6926         320 :   if (Cmp == APFloat::cmpGreaterThan)
    6927           8 :     return SDValue();
    6928             : 
    6929             :   // TODO: Check IEEE bit enabled?
    6930         624 :   EVT VT = Op0.getValueType();
    6931         312 :   if (Subtarget->enableDX10Clamp()) {
    6932             :     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    6933             :     // hardware fmed3 behavior converting to a min.
    6934             :     // FIXME: Should this be allowing -0.0?
    6935         859 :     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
    6936         250 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
    6937             :   }
    6938             : 
    6939             :   // med3 for f16 is only available on gfx9+, and not available for v2f16.
    6940          10 :   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    6941             :     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    6942             :     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    6943             :     // then give the other result, which is different from med3 with a NaN
    6944             :     // input.
    6945          53 :     SDValue Var = Op0.getOperand(0);
    6946          26 :     if (!isKnownNeverSNan(DAG, Var))
    6947          15 :       return SDValue();
    6948             : 
    6949             :     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
    6950          76 :                        Var, SDValue(K0, 0), SDValue(K1, 0));
    6951             :   }
    6952             : 
    6953           9 :   return SDValue();
    6954             : }
    6955             : 
    6956        3662 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    6957             :                                                DAGCombinerInfo &DCI) const {
    6958        3662 :   SelectionDAG &DAG = DCI.DAG;
    6959             : 
    6960        7324 :   EVT VT = N->getValueType(0);
    6961        3662 :   unsigned Opc = N->getOpcode();
    6962        3662 :   SDValue Op0 = N->getOperand(0);
    6963        3662 :   SDValue Op1 = N->getOperand(1);
    6964             : 
     6965             :   // Only do this if the inner op has one use since this will just increase
    6966             :   // register pressure for no benefit.
    6967             : 
    6968             : 
    6969        7274 :   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
    6970        3662 :       !VT.isVector() && VT != MVT::f64 &&
    6971         638 :       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
    6972             :     // max(max(a, b), c) -> max3(a, b, c)
    6973             :     // min(min(a, b), c) -> min3(a, b, c)
    6974        3131 :     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
    6975             :       SDLoc DL(N);
    6976             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    6977             :                          DL,
    6978             :                          N->getValueType(0),
    6979             :                          Op0.getOperand(0),
    6980             :                          Op0.getOperand(1),
    6981          78 :                          Op1);
    6982             :     }
    6983             : 
    6984             :     // Try commuted.
    6985             :     // max(a, max(b, c)) -> max3(a, b, c)
    6986             :     // min(a, min(b, c)) -> min3(a, b, c)
    6987        3066 :     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
    6988             :       SDLoc DL(N);
    6989             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    6990             :                          DL,
    6991             :                          N->getValueType(0),
    6992             :                          Op0,
    6993             :                          Op1.getOperand(0),
    6994          40 :                          Op1.getOperand(1));
    6995             :     }
    6996             :   }
    6997             : 
    6998             :   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
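                      :   // e.g. smin(smax(x, -5), 17) -> smed3(x, -5, 17), i.e. clamp x to [-5, 17].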
    6999        4243 :   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    7000         180 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
    7001          45 :       return Med3;
    7002             :   }
    7003             : 
    7004        4093 :   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    7005         120 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
    7006           6 :       return Med3;
    7007             :   }
    7008             : 
    7009             :   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
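                      :   // The common case is K0 = 0.0, K1 = 1.0, which performFPMed3ImmCombine
                      :   // further simplifies to CLAMP when dx10_clamp is enabled.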
    7010         988 :   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
    7011          21 :        (Opc == AMDGPUISD::FMIN_LEGACY &&
    7012             :         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
    7013             :       (VT == MVT::f32 || VT == MVT::f64 ||
    7014          72 :        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
    7015        4046 :        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    7016             :       Op0.hasOneUse()) {
    7017         910 :     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
    7018         288 :       return Res;
    7019             :   }
    7020             : 
    7021        3264 :   return SDValue();
    7022             : }
    7023             : 
    7024         172 : static bool isClampZeroToOne(SDValue A, SDValue B) {
    7025             :   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    7026             :     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
    7027             :       // FIXME: Should this be allowing -0.0?
    7028         259 :       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
    7029          74 :              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    7030             :     }
    7031             :   }
    7032             : 
    7033             :   return false;
    7034             : }
    7035             : 
    7036             : // FIXME: Should only worry about snans for version with chain.
    7037         113 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
    7038             :                                               DAGCombinerInfo &DCI) const {
    7039         226 :   EVT VT = N->getValueType(0);
    7040             :   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
    7041             :   // NaNs. With a NaN input, the order of the operands may change the result.
    7042             : 
    7043         113 :   SelectionDAG &DAG = DCI.DAG;
    7044             :   SDLoc SL(N);
    7045             : 
    7046         113 :   SDValue Src0 = N->getOperand(0);
    7047         113 :   SDValue Src1 = N->getOperand(1);
    7048         113 :   SDValue Src2 = N->getOperand(2);
    7049             : 
    7050         113 :   if (isClampZeroToOne(Src0, Src1)) {
    7051             :     // const_a, const_b, x -> clamp is safe in all cases including signaling
    7052             :     // nans.
    7053             :     // FIXME: Should this be allowing -0.0?
    7054          36 :     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
    7055             :   }
    7056             : 
    7057             :   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
    7058             :   // handling no dx10-clamp?
    7059          77 :   if (Subtarget->enableDX10Clamp()) {
     7060             :     // If NaNs are clamped to 0, we are free to reorder the inputs.
    7061             : 
    7062             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    7063             :       std::swap(Src0, Src1);
    7064             : 
    7065             :     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
    7066             :       std::swap(Src1, Src2);
    7067             : 
    7068             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    7069             :       std::swap(Src0, Src1);
    7070             : 
    7071          59 :     if (isClampZeroToOne(Src1, Src2))
    7072          12 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
    7073             :   }
    7074             : 
    7075          65 :   return SDValue();
    7076             : }
    7077             : 
    7078         139 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
    7079             :                                                  DAGCombinerInfo &DCI) const {
    7080         139 :   SDValue Src0 = N->getOperand(0);
    7081         139 :   SDValue Src1 = N->getOperand(1);
    7082         156 :   if (Src0.isUndef() && Src1.isUndef())
    7083           6 :     return DCI.DAG.getUNDEF(N->getValueType(0));
    7084         136 :   return SDValue();
    7085             : }
    7086             : 
    7087      249795 : SDValue SITargetLowering::performExtractVectorEltCombine(
    7088             :   SDNode *N, DAGCombinerInfo &DCI) const {
    7089      249795 :   SDValue Vec = N->getOperand(0);
    7090      249795 :   SelectionDAG &DAG = DCI.DAG;
    7091             : 
    7092      249795 :   EVT VecVT = Vec.getValueType();
    7093      249795 :   EVT EltVT = VecVT.getVectorElementType();
    7094             : 
    7095      249723 :   if ((Vec.getOpcode() == ISD::FNEG ||
    7096      249905 :        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    7097             :     SDLoc SL(N);
    7098         156 :     EVT EltVT = N->getValueType(0);
    7099          78 :     SDValue Idx = N->getOperand(1);
    7100             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7101          78 :                               Vec.getOperand(0), Idx);
    7102          78 :     return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
    7103             :   }
    7104             : 
    7105             :   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
    7106             :   //    =>
    7107             :   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
    7108             :   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
    7109             :   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
    7110      260279 :   if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
    7111             :     SDLoc SL(N);
    7112        2160 :     EVT EltVT = N->getValueType(0);
    7113        1080 :     SDValue Idx = N->getOperand(1);
    7114             :     unsigned Opc = Vec.getOpcode();
    7115             : 
    7116        1080 :     switch(Opc) {
    7117        1051 :     default:
    7118        1051 :       return SDValue();
    7119             :       // TODO: Support other binary operations.
    7120             :     case ISD::FADD:
    7121             :     case ISD::ADD:
    7122             :     case ISD::UMIN:
    7123             :     case ISD::UMAX:
    7124             :     case ISD::SMIN:
    7125             :     case ISD::SMAX:
    7126             :     case ISD::FMAXNUM:
    7127             :     case ISD::FMINNUM:
    7128             :       return DAG.getNode(Opc, SL, EltVT,
    7129             :                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7130             :                                      Vec.getOperand(0), Idx),
    7131             :                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    7132          58 :                                      Vec.getOperand(1), Idx));
    7133             :     }
    7134             :   }
    7135             : 
    7136      248637 :   if (!DCI.isBeforeLegalize())
    7137      213009 :     return SDValue();
    7138             : 
    7139       35628 :   unsigned VecSize = VecVT.getSizeInBits();
    7140       35628 :   unsigned EltSize = EltVT.getSizeInBits();
    7141             : 
    7142             :   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
    7143             :   // elements. This exposes more load reduction opportunities by replacing
    7144             :   // multiple small extract_vector_elements with a single 32-bit extract.
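                      :   // e.g. extracting byte 5 of a v8i8 becomes: bitcast to v2i32, extract
                      :   // element 1, shift right by 8, truncate back to i8.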
    7145       35628 :   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
    7146         657 :   if (EltSize <= 16 &&
    7147         649 :       EltVT.isByteSized() &&
    7148         412 :       VecSize > 32 &&
    7149       36382 :       VecSize % 32 == 0 &&
    7150             :       Idx) {
    7151         342 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
    7152             : 
    7153         684 :     unsigned BitIndex = Idx->getZExtValue() * EltSize;
    7154         342 :     unsigned EltIdx = BitIndex / 32;
    7155         342 :     unsigned LeftoverBitIdx = BitIndex % 32;
    7156             :     SDLoc SL(N);
    7157             : 
    7158         342 :     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    7159         342 :     DCI.AddToWorklist(Cast.getNode());
    7160             : 
    7161             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
    7162         684 :                               DAG.getConstant(EltIdx, SL, MVT::i32));
    7163         342 :     DCI.AddToWorklist(Elt.getNode());
    7164             :     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
    7165         684 :                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    7166         342 :     DCI.AddToWorklist(Srl.getNode());
    7167             : 
    7168         342 :     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    7169         342 :     DCI.AddToWorklist(Trunc.getNode());
    7170         342 :     return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
    7171             :   }
    7172             : 
    7173       35286 :   return SDValue();
    7174             : }
    7175             : 
    7176        3322 : static bool convertBuildVectorCastElt(SelectionDAG &DAG,
    7177             :                                       SDValue &Lo, SDValue &Hi) {
    7178        3322 :   if (Hi.getOpcode() == ISD::BITCAST &&
    7179        3328 :       Hi.getOperand(0).getValueType() == MVT::f16 &&
    7180           4 :       (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
    7181           4 :     Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
    7182           4 :     Hi = Hi.getOperand(0);
    7183           2 :     return true;
    7184             :   }
    7185             : 
    7186             :   return false;
    7187             : }
    7188             : 
    7189      128341 : SDValue SITargetLowering::performBuildVectorCombine(
    7190             :   SDNode *N, DAGCombinerInfo &DCI) const {
    7191             :   SDLoc SL(N);
    7192             : 
    7193             :   if (!isTypeLegal(MVT::v2i16))
    7194       57674 :     return SDValue();
    7195       70667 :   SelectionDAG &DAG = DCI.DAG;
    7196      141334 :   EVT VT = N->getValueType(0);
    7197             : 
    7198             :   if (VT == MVT::v2i16) {
    7199        1662 :     SDValue Lo = N->getOperand(0);
    7200        1662 :     SDValue Hi = N->getOperand(1);
    7201             : 
    7202             :     // v2i16 build_vector (const|undef), (bitcast f16:$x)
     7203             :     // -> bitcast (v2f16 build_vector const|undef, $x)
    7204        1662 :     if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
    7205           4 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
    7206           2 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    7207             :     }
    7208             : 
    7209        1660 :     if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
    7210           0 :       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
    7211           0 :       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
    7212             :     }
    7213             :   }
    7214             : 
    7215       70665 :   return SDValue();
    7216             : }
    7217             : 
    7218         200 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
    7219             :                                           const SDNode *N0,
    7220             :                                           const SDNode *N1) const {
    7221         400 :   EVT VT = N0->getValueType(0);
    7222             : 
    7223             :   // Only do this if we are not trying to support denormals. v_mad_f32 does not
    7224             :   // support denormals ever.
    7225         122 :   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
    7226          32 :       (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
    7227             :     return ISD::FMAD;
    7228             : 
    7229          86 :   const TargetOptions &Options = DAG.getTarget().Options;
    7230         176 :   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
    7231          51 :        (N0->getFlags().hasAllowContract() &&
    7232         136 :         N1->getFlags().hasAllowContract())) &&
    7233          44 :       isFMAFasterThanFMulAndFAdd(VT)) {
    7234             :     return ISD::FMA;
    7235             :   }
    7236             : 
    7237             :   return 0;
    7238             : }
    7239             : 
    7240          20 : static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
    7241             :                            EVT VT,
    7242             :                            SDValue N0, SDValue N1, SDValue N2,
    7243             :                            bool Signed) {
    7244          20 :   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
    7245          20 :   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
    7246          20 :   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
    7247          20 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
    7248             : }
    7249             : 
    7250      109066 : SDValue SITargetLowering::performAddCombine(SDNode *N,
    7251             :                                             DAGCombinerInfo &DCI) const {
    7252      109066 :   SelectionDAG &DAG = DCI.DAG;
    7253      218132 :   EVT VT = N->getValueType(0);
    7254             :   SDLoc SL(N);
    7255      109066 :   SDValue LHS = N->getOperand(0);
    7256      109066 :   SDValue RHS = N->getOperand(1);
    7257             : 
    7258      108810 :   if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
    7259        1783 :       && Subtarget->hasMad64_32() &&
    7260      109494 :       !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
    7261             :       VT.getScalarSizeInBits() <= 64) {
    7262          24 :     if (LHS.getOpcode() != ISD::MUL)
    7263             :       std::swap(LHS, RHS);
    7264             : 
    7265          24 :     SDValue MulLHS = LHS.getOperand(0);
    7266          24 :     SDValue MulRHS = LHS.getOperand(1);
    7267          24 :     SDValue AddRHS = RHS;
    7268             : 
    7269             :     // TODO: Maybe restrict if SGPR inputs.
    7270          38 :     if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
    7271          14 :         numBitsUnsigned(MulRHS, DAG) <= 32) {
    7272          13 :       MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
    7273          13 :       MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
    7274          13 :       AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
    7275          13 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
    7276             :     }
    7277             : 
    7278          18 :     if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
    7279           7 :       MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
    7280           7 :       MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
    7281           7 :       AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
    7282           7 :       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
    7283             :     }
    7284             : 
    7285           4 :     return SDValue();
    7286             :   }
    7287             : 
    7288      131010 :   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    7289       95824 :     return SDValue();
    7290             : 
    7291             :   // add x, zext (setcc) => addcarry x, 0, setcc
    7292             :   // add x, sext (setcc) => subcarry x, 0, setcc
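                      :   // zext of an i1 is 0 or 1, which is exactly a carry-in; sext is 0 or -1,
                      :   // which is exactly a borrow-in.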
    7293             :   unsigned Opc = LHS.getOpcode();
    7294       26436 :   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
    7295       13218 :       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    7296             :     std::swap(RHS, LHS);
    7297             : 
    7298             :   Opc = RHS.getOpcode();
    7299       13218 :   switch (Opc) {
    7300             :   default: break;
    7301         103 :   case ISD::ZERO_EXTEND:
    7302             :   case ISD::SIGN_EXTEND:
    7303             :   case ISD::ANY_EXTEND: {
    7304         103 :     auto Cond = RHS.getOperand(0);
    7305             :     if (!isBoolSGPR(Cond))
    7306             :       break;
    7307          45 :     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    7308          90 :     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    7309          45 :     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    7310          45 :     return DAG.getNode(Opc, SL, VTList, Args);
    7311             :   }
    7312           0 :   case ISD::ADDCARRY: {
    7313             :     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    7314             :     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    7315           0 :     if (!C || C->getZExtValue() != 0) break;
    7316           0 :     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    7317           0 :     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
    7318             :   }
    7319             :   }
    7320       13173 :   return SDValue();
    7321             : }
    7322             : 
    7323        3499 : SDValue SITargetLowering::performSubCombine(SDNode *N,
    7324             :                                             DAGCombinerInfo &DCI) const {
    7325        3499 :   SelectionDAG &DAG = DCI.DAG;
    7326        3499 :   EVT VT = N->getValueType(0);
    7327             : 
    7328             :   if (VT != MVT::i32)
    7329         582 :     return SDValue();
    7330             : 
    7331             :   SDLoc SL(N);
    7332        2917 :   SDValue LHS = N->getOperand(0);
    7333        2917 :   SDValue RHS = N->getOperand(1);
    7334             : 
    7335             :   unsigned Opc = LHS.getOpcode();
    7336        2917 :   if (Opc != ISD::SUBCARRY)
    7337             :     std::swap(RHS, LHS);
    7338             : 
    7339        2917 :   if (LHS.getOpcode() == ISD::SUBCARRY) {
    7340             :     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
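                      :     // (x - 0 - cc) - y == x - y - cc, so y can take the place of the zero.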
    7341             :     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    7342           4 :     if (!C || C->getZExtValue() != 0)
    7343           0 :       return SDValue();
    7344           2 :     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    7345           6 :     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
    7346             :   }
    7347        2915 :   return SDValue();
    7348             : }
    7349             : 
    7350         680 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
    7351             :   DAGCombinerInfo &DCI) const {
    7352             : 
    7353         680 :   if (N->getValueType(0) != MVT::i32)
    7354           0 :     return SDValue();
    7355             : 
    7356         680 :   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    7357         514 :   if (!C || C->getZExtValue() != 0)
    7358         423 :     return SDValue();
    7359             : 
    7360         257 :   SelectionDAG &DAG = DCI.DAG;
    7361         257 :   SDValue LHS = N->getOperand(0);
    7362             : 
    7363             :   // addcarry (add x, y), 0, cc => addcarry x, y, cc
    7364             :   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
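                      :   // addcarry (x + y), 0, cc computes x + y + cc, which is addcarry x, y, cc;
                      :   // likewise for subcarry.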
    7365             :   unsigned LHSOpc = LHS.getOpcode();
    7366         257 :   unsigned Opc = N->getOpcode();
    7367         514 :   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
    7368         257 :       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    7369           2 :     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    7370           6 :     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
    7371             :   }
    7372         255 :   return SDValue();
    7373             : }
    7374             : 
    7375        7924 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
    7376             :                                              DAGCombinerInfo &DCI) const {
    7377        7924 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    7378        5699 :     return SDValue();
    7379             : 
    7380        2225 :   SelectionDAG &DAG = DCI.DAG;
    7381        4450 :   EVT VT = N->getValueType(0);
    7382             : 
    7383             :   SDLoc SL(N);
    7384        2225 :   SDValue LHS = N->getOperand(0);
    7385        2225 :   SDValue RHS = N->getOperand(1);
    7386             : 
    7387             :   // These should really be instruction patterns, but writing patterns with
     7388             :   // source modifiers is a pain.
    7389             : 
    7390             :   // fadd (fadd (a, a), b) -> mad 2.0, a, b
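                      :   // 2.0 is an inline immediate on AMDGPU, so the fused form needs no extra
                      :   // register to hold the constant.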
    7391        2225 :   if (LHS.getOpcode() == ISD::FADD) {
    7392         302 :     SDValue A = LHS.getOperand(0);
    7393             :     if (A == LHS.getOperand(1)) {
    7394         105 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    7395         105 :       if (FusedOp != 0) {
    7396          73 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7397          73 :         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    7398             :       }
    7399             :     }
    7400             :   }
    7401             : 
    7402             :   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    7403        2152 :   if (RHS.getOpcode() == ISD::FADD) {
    7404         130 :     SDValue A = RHS.getOperand(0);
    7405             :     if (A == RHS.getOperand(1)) {
    7406          30 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    7407          30 :       if (FusedOp != 0) {
    7408          20 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7409          20 :         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
    7410             :       }
    7411             :     }
    7412             :   }
    7413             : 
    7414        2132 :   return SDValue();
    7415             : }
    7416             : 
    7417        1561 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
    7418             :                                              DAGCombinerInfo &DCI) const {
    7419        1561 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    7420        1040 :     return SDValue();
    7421             : 
    7422         521 :   SelectionDAG &DAG = DCI.DAG;
    7423             :   SDLoc SL(N);
    7424        1042 :   EVT VT = N->getValueType(0);
    7425             :   assert(!VT.isVector());
    7426             : 
    7427             :   // Try to get the fneg to fold into the source modifier. This undoes generic
    7428             :   // DAG combines and folds them into the mad.
    7429             :   //
    7430             :   // Only do this if we are not trying to support denormals. v_mad_f32 does
    7431             :   // not support denormals ever.
    7432         521 :   SDValue LHS = N->getOperand(0);
    7433         521 :   SDValue RHS = N->getOperand(1);
    7434         521 :   if (LHS.getOpcode() == ISD::FADD) {
    7435             :     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    7436          46 :     SDValue A = LHS.getOperand(0);
    7437             :     if (A == LHS.getOperand(1)) {
    7438          24 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    7439          24 :       if (FusedOp != 0){
    7440          17 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    7441          17 :         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    7442             : 
    7443          17 :         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    7444             :       }
    7445             :     }
    7446             :   }
    7447             : 
    7448         504 :   if (RHS.getOpcode() == ISD::FADD) {
    7449             :     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
    7450             : 
    7451          50 :     SDValue A = RHS.getOperand(0);
    7452             :     if (A == RHS.getOperand(1)) {
    7453          41 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    7454          41 :       if (FusedOp != 0){
    7455          32 :         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
    7456          32 :         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
    7457             :       }
    7458             :     }
    7459             :   }
    7460             : 
    7461         472 :   return SDValue();
    7462             : }
    7463             : 
    7464        9956 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
    7465             :                                               DAGCombinerInfo &DCI) const {
    7466        9956 :   SelectionDAG &DAG = DCI.DAG;
    7467             :   SDLoc SL(N);
    7468             : 
    7469        9956 :   SDValue LHS = N->getOperand(0);
    7470        9956 :   SDValue RHS = N->getOperand(1);
    7471             :   EVT VT = LHS.getValueType();
    7472        9956 :   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
    7473             : 
    7474             :   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    7475             :   if (!CRHS) {
    7476             :     CRHS = dyn_cast<ConstantSDNode>(LHS);
    7477             :     if (CRHS) {
    7478             :       std::swap(LHS, RHS);
    7479           0 :       CC = getSetCCSwappedOperands(CC);
    7480             :     }
    7481             :   }
    7482             : 
    7483        9956 :   if (CRHS) {
    7484        4293 :     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
    7485          16 :         isBoolSGPR(LHS.getOperand(0))) {
    7486             :       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
    7487             :       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
    7488             :       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
    7489             :       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
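                      :       // sext of an i1 is 0 or -1, so comparing it against 0 or -1 collapses
                      :       // to the i1 value itself or its inversion.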
    7490           6 :       if ((CRHS->isAllOnesValue() &&
    7491           3 :            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
    7492           0 :           (CRHS->isNullValue() &&
    7493           0 :            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
    7494             :         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    7495           6 :                            DAG.getConstant(-1, SL, MVT::i1));
    7496           0 :       if ((CRHS->isAllOnesValue() &&
    7497           0 :            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
    7498           0 :           (CRHS->isNullValue() &&
    7499           0 :            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
    7500           0 :         return LHS.getOperand(0);
    7501             :     }
    7502             : 
    7503        5463 :     uint64_t CRHSVal = CRHS->getZExtValue();
    7504        9358 :     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
    7505             :         LHS.getOpcode() == ISD::SELECT &&
    7506             :         isa<ConstantSDNode>(LHS.getOperand(1)) &&
    7507         162 :         isa<ConstantSDNode>(LHS.getOperand(2)) &&
    7508        5463 :         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    7509         162 :         isBoolSGPR(LHS.getOperand(0))) {
    7510             :       // Given CT != FT:
    7511             :       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
    7512             :       // setcc (select cc, CT, CF), CF, ne => cc
    7513             :       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
    7514             :       // setcc (select cc, CT, CF), CT, eq => cc
    7515             :       uint64_t CT = LHS.getConstantOperandVal(1);
    7516             :       uint64_t CF = LHS.getConstantOperandVal(2);
    7517             : 
    7518         167 :       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
    7519           5 :           (CT == CRHSVal && CC == ISD::SETNE))
    7520             :         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    7521         316 :                            DAG.getConstant(-1, SL, MVT::i1));
    7522           7 :       if ((CF == CRHSVal && CC == ISD::SETNE) ||
    7523           3 :           (CT == CRHSVal && CC == ISD::SETEQ))
    7524           2 :         return LHS.getOperand(0);
    7525             :     }
    7526             :   }
    7527             : 
    7528        8039 :   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
    7529             :                                            VT != MVT::f16))
    7530        3274 :     return SDValue();
    7531             : 
    7532             :   // Match isinf pattern
    7533             :   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
    7534        6659 :   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    7535             :     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    7536             :     if (!CRHS)
    7537           0 :       return SDValue();
    7538             : 
    7539           2 :     const APFloat &APF = CRHS->getValueAPF();
    7540           4 :     if (APF.isInfinity() && !APF.isNegative()) {
    7541             :       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
    7542             :       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
    7543           4 :                          DAG.getConstant(Mask, SL, MVT::i32));
    7544             :     }
    7545             :   }
    7546             : 
    7547        6517 :   return SDValue();
    7548             : }
    7549             : 
    7550         356 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
    7551             :                                                      DAGCombinerInfo &DCI) const {
    7552         356 :   SelectionDAG &DAG = DCI.DAG;
    7553             :   SDLoc SL(N);
    7554         712 :   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
    7555             : 
    7556         356 :   SDValue Src = N->getOperand(0);
    7557         356 :   SDValue Srl = N->getOperand(0);
    7558         356 :   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    7559          47 :     Srl = Srl.getOperand(0);
    7560             : 
    7561             :   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    7562         356 :   if (Srl.getOpcode() == ISD::SRL) {
    7563             :     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    7564             :     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    7565             :     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
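                      :     // In general the new byte index is the old one plus ShiftAmt / 8, provided
                      :     // ShiftAmt is a multiple of 8 and the result still addresses bytes 0-3.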
    7566             : 
    7567             :     if (const ConstantSDNode *C =
    7568             :         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
    7569         118 :       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
    7570          59 :                                EVT(MVT::i32));
    7571             : 
    7572         118 :       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
    7573          59 :       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
    7574          59 :         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
    7575          59 :                            MVT::f32, Srl);
    7576             :       }
    7577             :     }
    7578             :   }
    7579             : 
    7580         297 :   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
    7581             : 
    7582         297 :   KnownBits Known;
    7583         297 :   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    7584         594 :                                         !DCI.isBeforeLegalizeOps());
    7585         297 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    7586         594 :   if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
    7587         297 :       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    7588          91 :     DCI.CommitTargetLoweringOpt(TLO);
    7589             :   }
    7590             : 
    7591         297 :   return SDValue();
    7592             : }
    7593             : 
    7594         327 : SDValue SITargetLowering::performClampCombine(SDNode *N,
    7595             :                                               DAGCombinerInfo &DCI) const {
    7596         327 :   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
    7597             :   if (!CSrc)
    7598         302 :     return SDValue();
    7599             : 
    7600          25 :   const APFloat &F = CSrc->getValueAPF();
    7601          25 :   APFloat Zero = APFloat::getZero(F.getSemantics());
    7602          25 :   APFloat::cmpResult Cmp0 = F.compare(Zero);
    7603          25 :   if (Cmp0 == APFloat::cmpLessThan ||
    7604          12 :       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
    7605          27 :     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
    7606             :   }
    7607             : 
    7608          16 :   APFloat One(F.getSemantics(), "1.0");
    7609          16 :   APFloat::cmpResult Cmp1 = F.compare(One);
    7610          16 :   if (Cmp1 == APFloat::cmpGreaterThan)
    7611           9 :     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
    7612             : 
    7613          13 :   return SDValue(CSrc, 0);
    7614             : }
    7615             : 
    7616             : 
    7617     1548362 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    7618             :                                             DAGCombinerInfo &DCI) const {
    7619     3096724 :   switch (N->getOpcode()) {
    7620      342357 :   default:
    7621      342357 :     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7622      109066 :   case ISD::ADD:
    7623      109066 :     return performAddCombine(N, DCI);
    7624        3499 :   case ISD::SUB:
    7625        3499 :     return performSubCombine(N, DCI);
    7626         680 :   case ISD::ADDCARRY:
    7627             :   case ISD::SUBCARRY:
    7628         680 :     return performAddCarrySubCarryCombine(N, DCI);
    7629        7924 :   case ISD::FADD:
    7630        7924 :     return performFAddCombine(N, DCI);
    7631        1561 :   case ISD::FSUB:
    7632        1561 :     return performFSubCombine(N, DCI);
    7633        9956 :   case ISD::SETCC:
    7634        9956 :     return performSetCCCombine(N, DCI);
    7635       10620 :   case ISD::FMAXNUM:
    7636             :   case ISD::FMINNUM:
    7637             :   case ISD::SMAX:
    7638             :   case ISD::SMIN:
    7639             :   case ISD::UMAX:
    7640             :   case ISD::UMIN:
    7641             :   case AMDGPUISD::FMIN_LEGACY:
    7642             :   case AMDGPUISD::FMAX_LEGACY: {
    7643       14282 :     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
    7644        3662 :         getTargetMachine().getOptLevel() > CodeGenOpt::None)
    7645        3662 :       return performMinMaxCombine(N, DCI);
    7646             :     break;
    7647             :   }
    7648             :   case ISD::LOAD: {
    7649      294158 :     if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
    7650          34 :       return Widended;
    7651             :     LLVM_FALLTHROUGH;
    7652             :   }
    7653             :   case ISD::STORE:
    7654             :   case ISD::ATOMIC_LOAD:
    7655             :   case ISD::ATOMIC_STORE:
    7656             :   case ISD::ATOMIC_CMP_SWAP:
    7657             :   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    7658             :   case ISD::ATOMIC_SWAP:
    7659             :   case ISD::ATOMIC_LOAD_ADD:
    7660             :   case ISD::ATOMIC_LOAD_SUB:
    7661             :   case ISD::ATOMIC_LOAD_AND:
    7662             :   case ISD::ATOMIC_LOAD_OR:
    7663             :   case ISD::ATOMIC_LOAD_XOR:
    7664             :   case ISD::ATOMIC_LOAD_NAND:
    7665             :   case ISD::ATOMIC_LOAD_MIN:
    7666             :   case ISD::ATOMIC_LOAD_MAX:
    7667             :   case ISD::ATOMIC_LOAD_UMIN:
    7668             :   case ISD::ATOMIC_LOAD_UMAX:
    7669             :   case AMDGPUISD::ATOMIC_INC:
    7670             :   case AMDGPUISD::ATOMIC_DEC:
    7671             :   case AMDGPUISD::ATOMIC_LOAD_FADD:
    7672             :   case AMDGPUISD::ATOMIC_LOAD_FMIN:
    7673             :   case AMDGPUISD::ATOMIC_LOAD_FMAX:  // TODO: Target mem intrinsics.
    7674      616631 :     if (DCI.isBeforeLegalize())
    7675             :       break;
    7676      331174 :     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
    7677       30067 :   case ISD::AND:
    7678       30067 :     return performAndCombine(N, DCI);
    7679       15354 :   case ISD::OR:
    7680       15354 :     return performOrCombine(N, DCI);
    7681        1870 :   case ISD::XOR:
    7682        1870 :     return performXorCombine(N, DCI);
    7683       19383 :   case ISD::ZERO_EXTEND:
    7684       19383 :     return performZeroExtendCombine(N, DCI);
    7685          81 :   case AMDGPUISD::FP_CLASS:
    7686          81 :     return performClassCombine(N, DCI);
    7687         426 :   case ISD::FCANONICALIZE:
    7688         426 :     return performFCanonicalizeCombine(N, DCI);
    7689         728 :   case AMDGPUISD::RCP:
    7690         728 :     return performRcpCombine(N, DCI);
    7691         610 :   case AMDGPUISD::FRACT:
    7692             :   case AMDGPUISD::RSQ:
    7693             :   case AMDGPUISD::RCP_LEGACY:
    7694             :   case AMDGPUISD::RSQ_LEGACY:
    7695             :   case AMDGPUISD::RCP_IFLAG:
    7696             :   case AMDGPUISD::RSQ_CLAMP:
    7697             :   case AMDGPUISD::LDEXP: {
    7698         610 :     SDValue Src = N->getOperand(0);
    7699         610 :     if (Src.isUndef())
    7700          10 :       return Src;
    7701             :     break;
    7702             :   }
    7703        1680 :   case ISD::SINT_TO_FP:
    7704             :   case ISD::UINT_TO_FP:
    7705        1680 :     return performUCharToFloatCombine(N, DCI);
    7706         356 :   case AMDGPUISD::CVT_F32_UBYTE0:
    7707             :   case AMDGPUISD::CVT_F32_UBYTE1:
    7708             :   case AMDGPUISD::CVT_F32_UBYTE2:
    7709             :   case AMDGPUISD::CVT_F32_UBYTE3:
    7710         356 :     return performCvtF32UByteNCombine(N, DCI);
    7711         113 :   case AMDGPUISD::FMED3:
    7712         113 :     return performFMed3Combine(N, DCI);
    7713         139 :   case AMDGPUISD::CVT_PKRTZ_F16_F32:
    7714         139 :     return performCvtPkRTZCombine(N, DCI);
    7715         327 :   case AMDGPUISD::CLAMP:
    7716         327 :     return performClampCombine(N, DCI);
    7717        1981 :   case ISD::SCALAR_TO_VECTOR: {
    7718        1981 :     SelectionDAG &DAG = DCI.DAG;
    7719        3962 :     EVT VT = N->getValueType(0);
    7720             : 
    7721             :     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    7722             :     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
    7723             :       SDLoc SL(N);
    7724         118 :       SDValue Src = N->getOperand(0);
    7725             :       EVT EltVT = Src.getValueType();
    7726             :       if (EltVT == MVT::f16)
    7727          35 :         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
    7728             : 
    7729         118 :       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
    7730         118 :       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    7731             :     }
    7732             : 
    7733        1863 :     break;
    7734             :   }
    7735      244578 :   case ISD::EXTRACT_VECTOR_ELT:
    7736      244578 :     return performExtractVectorEltCombine(N, DCI);
    7737      128341 :   case ISD::BUILD_VECTOR:
    7738      128341 :     return performBuildVectorCombine(N, DCI);
    7739             :   }
    7740      294878 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    7741             : }
    7742             : 
    7743             : /// Helper function for adjustWritemask
    7744             : static unsigned SubIdx2Lane(unsigned Idx) {
    7745             :   switch (Idx) {
    7746             :   default: return 0;
    7747             :   case AMDGPU::sub0: return 0;
    7748             :   case AMDGPU::sub1: return 1;
    7749             :   case AMDGPU::sub2: return 2;
    7750             :   case AMDGPU::sub3: return 3;
    7751             :   }
    7752             : }
    7753             : 
    7754             : /// Adjust the writemask of MIMG instructions
    7755         675 : SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    7756             :                                           SelectionDAG &DAG) const {
    7757         675 :   unsigned Opcode = Node->getMachineOpcode();
    7758             : 
    7759             :   // Subtract 1 because the vdata output is not a MachineSDNode operand.
    7760         675 :   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
    7761        1313 :   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    7762             :     return Node; // not implemented for D16
    7763             : 
    7764         651 :   SDNode *Users[4] = { nullptr };
    7765             :   unsigned Lane = 0;
    7766         651 :   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
    7767         651 :   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
    7768             :   unsigned NewDmask = 0;
    7769         651 :   bool HasChain = Node->getNumValues() > 1;
    7770             : 
    7771         651 :   if (OldDmask == 0) {
    7772             :     // These are folded out, but on the off chance it happens, don't assert.
    7773             :     return Node;
    7774             :   }
    7775             : 
    7776             :   // Try to figure out the used register components
    7777         651 :   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
    7778        2384 :        I != E; ++I) {
    7779             : 
    7780             :     // Don't look at users of the chain.
    7781        1892 :     if (I.getUse().getResNo() != 0)
    7782         111 :       continue;
    7783             : 
    7784             :     // Abort if we can't understand the usage
    7785        1781 :     if (!I->isMachineOpcode() ||
    7786             :         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    7787             :       return Node;
    7788             : 
    7789             :     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    7790             :     // Note that subregs are packed, i.e. Lane==0 is the first bit set
    7791             :     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    7792             :     // set, etc.
    7793        1624 :     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    7794             : 
    7795             :     // Set which texture component corresponds to the lane.
    7796             :     unsigned Comp;
    7797        9242 :     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
    7798        3809 :       Comp = countTrailingZeros(Dmask);
    7799        3809 :       Dmask &= ~(1 << Comp);
    7800             :     }
    7801             : 
    7802             :     // Abort if we have more than one user per component
    7803        1624 :     if (Users[Lane])
    7804             :       return Node;
    7805             : 
    7806        1622 :     Users[Lane] = *I;
    7807        1622 :     NewDmask |= 1 << Comp;
    7808             :   }
    7809             : 
    7810             :   // Abort if there's no change
    7811         492 :   if (NewDmask == OldDmask)
    7812             :     return Node;
    7813             : 
    7814             :   unsigned BitsSet = countPopulation(NewDmask);
    7815             : 
    7816          95 :   int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
    7817             :   assert(NewOpcode != -1 &&
    7818             :          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
    7819             :          "failed to find equivalent MIMG op");
    7820             : 
    7821             :   // Adjust the writemask in the node
    7822             :   SmallVector<SDValue, 12> Ops;
    7823          95 :   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
    7824         380 :   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
    7825         285 :   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
    7826             : 
    7827         190 :   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
    7828             : 
    7829             :   MVT ResultVT = BitsSet == 1 ?
    7830          95 :     SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
    7831             :   SDVTList NewVTList = HasChain ?
    7832         190 :     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
    7833             : 
    7834             : 
    7835         285 :   MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
    7836          95 :                                               NewVTList, Ops);
    7837             : 
    7838          95 :   if (HasChain) {
    7839             :     // Update chain.
    7840          92 :     NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
    7841          92 :     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
    7842             :   }
    7843             : 
    7844          95 :   if (BitsSet == 1) {
    7845             :     assert(Node->hasNUsesOfValue(1, 0));
    7846         114 :     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
    7847         171 :                                       SDLoc(Node), Users[Lane]->getValueType(0),
    7848          57 :                                       SDValue(NewNode, 0));
    7849          57 :     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    7850          57 :     return nullptr;
    7851             :   }
    7852             : 
    7853             :   // Update the users of the node with the new indices
    7854         342 :   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    7855         152 :     SDNode *User = Users[i];
    7856         152 :     if (!User)
    7857          56 :       continue;
    7858             : 
    7859         288 :     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    7860          96 :     DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
    7861             : 
    7862          96 :     switch (Idx) {
    7863             :     default: break;
    7864          38 :     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    7865          38 :     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    7866          20 :     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    7867             :     }
    7868             :   }
    7869             : 
    7870          38 :   DAG.RemoveDeadNode(Node);
    7871          38 :   return nullptr;
    7872             : }
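The lane/dmask comment inside adjustWritemask is easiest to see in isolation.
A minimal standalone sketch (plain C++, not LLVM code): UsedLanes is a
hypothetical stand-in for the EXTRACT_SUBREG users collected from the DAG, and
__builtin_ctz stands in for countTrailingZeros.

  static unsigned recomputeDmask(unsigned OldDmask, const bool UsedLanes[4]) {
    unsigned NewDmask = 0;
    for (unsigned Lane = 0; Lane < 4; ++Lane) {
      if (!UsedLanes[Lane])
        continue;
      // The component backing this lane is the (Lane+1)-th set bit of OldDmask.
      unsigned Dmask = OldDmask, Comp = 0;
      for (unsigned i = 0; i <= Lane; ++i) {
        Comp = __builtin_ctz(Dmask);
        Dmask &= ~(1u << Comp);
      }
      NewDmask |= 1u << Comp;
    }
    // Example: OldDmask = 0b1011 (X, Y, W live) with lanes {0, 2} used
    // gives NewDmask = 0b1001 (only X and W are still written).
    return NewDmask;
  }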
    7873             : 
    7874             : static bool isFrameIndexOp(SDValue Op) {
    7875      409988 :   if (Op.getOpcode() == ISD::AssertZext)
    7876          68 :     Op = Op.getOperand(0);
    7877             : 
    7878             :   return isa<FrameIndexSDNode>(Op);
    7879             : }
    7880             : 
    7881             : /// Legalize target-independent instructions (e.g. INSERT_SUBREG)
    7882             : /// with frame index operands.
    7883             : /// LLVM assumes that inputs to these instructions are registers.
    7884       69252 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
    7885             :                                                         SelectionDAG &DAG) const {
    7886       69252 :   if (Node->getOpcode() == ISD::CopyToReg) {
    7887       12783 :     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    7888       12783 :     SDValue SrcVal = Node->getOperand(2);
    7889             : 
    7890             :     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    7891             :     // to try understanding copies to physical registers.
    7892         193 :     if (SrcVal.getValueType() == MVT::i1 &&
    7893         193 :         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
    7894             :       SDLoc SL(Node);
    7895           8 :       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7896             :       SDValue VReg = DAG.getRegister(
    7897           8 :         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
    7898             : 
    7899             :       SDNode *Glued = Node->getGluedNode();
    7900             :       SDValue ToVReg
    7901           8 :         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
    7902          12 :                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
    7903             :       SDValue ToResultReg
    7904             :         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
    7905          16 :                            VReg, ToVReg.getValue(1));
    7906           8 :       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
    7907           8 :       DAG.RemoveDeadNode(Node);
    7908             :       return ToResultReg.getNode();
    7909             :     }
    7910             :   }
    7911             : 
    7912             :   SmallVector<SDValue, 8> Ops;
    7913     1368452 :   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    7914     1229942 :     if (!isFrameIndexOp(Node->getOperand(i))) {
    7915      409966 :       Ops.push_back(Node->getOperand(i));
    7916      409966 :       continue;
    7917             :     }
    7918             : 
    7919             :     SDLoc DL(Node);
    7920          66 :     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
    7921             :                                      Node->getOperand(i).getValueType(),
    7922          22 :                                      Node->getOperand(i)), 0));
    7923             :   }
    7924             : 
    7925       69244 :   return DAG.UpdateNodeOperands(Node, Ops);
    7926             : }
    7927             : 
    7928             : /// Fold the instructions after selecting them.
    7929             : /// Returns null if users were already updated.
    7930      399647 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
    7931             :                                           SelectionDAG &DAG) const {
    7932      399647 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    7933      399647 :   unsigned Opcode = Node->getMachineOpcode();
    7934             : 
    7935      801766 :   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
    7936             :       !TII->isGather4(Opcode)) {
    7937         675 :     return adjustWritemask(Node, DAG);
    7938             :   }
    7939             : 
    7940      797944 :   if (Opcode == AMDGPU::INSERT_SUBREG ||
    7941      398972 :       Opcode == AMDGPU::REG_SEQUENCE) {
    7942       56469 :     legalizeTargetIndependentNode(Node, DAG);
    7943       56469 :     return Node;
    7944             :   }
    7945             : 
    7946      342503 :   switch (Opcode) {
    7947         273 :   case AMDGPU::V_DIV_SCALE_F32:
    7948             :   case AMDGPU::V_DIV_SCALE_F64: {
    7949             :     // Satisfy the operand register constraint when one of the inputs is
    7950             :     // undefined. Ordinarily each undef value will have its own implicit_def of
    7951             :     // a vreg, so force these to use a single register.
    7952         273 :     SDValue Src0 = Node->getOperand(0);
    7953         273 :     SDValue Src1 = Node->getOperand(1);
    7954         273 :     SDValue Src2 = Node->getOperand(2);
    7955             : 
    7956         270 :     if ((Src0.isMachineOpcode() &&
    7957         273 :          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
    7958             :         (Src0 == Src1 || Src0 == Src2))
    7959             :       break;
    7960             : 
    7961             :     MVT VT = Src0.getValueType().getSimpleVT();
    7962           6 :     const TargetRegisterClass *RC = getRegClassFor(VT);
    7963             : 
    7964           6 :     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    7965           6 :     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
    7966             : 
    7967          12 :     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
    7968          18 :                                       UndefReg, Src0, SDValue());
    7969             : 
    7970             :     // src0 must be the same register as src1 or src2, even if the value is
    7971             :     // undefined, so make sure we don't violate this constraint.
    7972           6 :     if (Src0.isMachineOpcode() &&
    7973             :         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
    7974           3 :       if (Src1.isMachineOpcode() &&
    7975             :           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7976             :         Src0 = Src1;
    7977           3 :       else if (Src2.isMachineOpcode() &&
    7978             :                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    7979             :         Src0 = Src2;
    7980             :       else {
    7981             :         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
    7982           1 :         Src0 = UndefReg;
    7983             :         Src1 = UndefReg;
    7984             :       }
    7985             :     } else
    7986             :       break;
    7987             : 
    7988           6 :     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    7989           6 :     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
    7990           0 :       Ops.push_back(Node->getOperand(I));
    7991             : 
    7992           3 :     Ops.push_back(ImpDef.getValue(1));
    7993           9 :     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    7994             :   }
    7995             :   default:
    7996             :     break;
    7997             :   }
    7998             : 
    7999      342500 :   return Node;
    8000             : }
    8001             : 
    8002             : /// Fix up instructions right after selection: legalize VOP3 operand
    8003             : /// constraints and replace unused atomics with their no-return forms.
    8004       32437 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
    8005             :                                                      SDNode *Node) const {
    8006       32437 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    8007             : 
    8008       32437 :   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    8009             : 
    8010       97311 :   if (TII->isVOP3(MI.getOpcode())) {
    8011             :     // Make sure constant bus requirements are respected.
    8012       30107 :     TII->legalizeOperandsVOP3(MRI, MI);
    8013       30107 :     return;
    8014             :   }
    8015             : 
    8016             :   // Replace unused atomics with the no return version.
    8017        2330 :   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
    8018        2330 :   if (NoRetAtomicOp != -1) {
    8019        1840 :     if (!Node->hasAnyUseOfValue(0)) {
    8020         946 :       MI.setDesc(TII->get(NoRetAtomicOp));
    8021         946 :       MI.RemoveOperand(0);
    8022         946 :       return;
    8023             :     }
    8024             : 
    8025             :     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    8026             :     // instruction, because the return type of these instructions is a vec2 of
    8027             :     // the memory type, so it can be tied to the input operand.
    8028             :     // This means these instructions always have a use, so we need to add a
    8029             :     // special case to check if the atomic has only one extract_subreg use,
    8030             :     // which itself has no uses.
    8031        1786 :     if ((Node->hasNUsesOfValue(1, 0) &&
    8032        2652 :          Node->use_begin()->isMachineOpcode() &&
    8033         906 :          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
    8034          12 :          !Node->use_begin()->hasAnyUseOfValue(0))) {
    8035           0 :       unsigned Def = MI.getOperand(0).getReg();
    8036             : 
    8037             :       // Change this into a noret atomic.
    8038           0 :       MI.setDesc(TII->get(NoRetAtomicOp));
    8039           0 :       MI.RemoveOperand(0);
    8040             : 
    8041             :       // If we only remove the def operand from the atomic instruction, the
    8042             :       // extract_subreg will be left with a use of a vreg without a def.
    8043             :       // So we need to insert an implicit_def to avoid machine verifier
    8044             :       // errors.
    8045           0 :       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
    8046           0 :               TII->get(AMDGPU::IMPLICIT_DEF), Def);
    8047             :     }
    8048             :     return;
    8049             :   }
    8050             : }
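From the source-language side, the no-return replacement above fires whenever
an atomic's old value is never read. A usage sketch, assuming HIP/CUDA-style
device code compiled for AMDGPU (names are illustrative):

  __global__ void bump(int *counter) {
    // The old value returned by atomicAdd is ignored, so the selected machine
    // atomic has no uses of value 0 and can take the no-return encoding.
    atomicAdd(counter, 1);
  }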
    8051             : 
    8052       42416 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
    8053             :                               uint64_t Val) {
    8054       42416 :   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
    8055       42416 :   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
    8056             : }
    8057             : 
    8058        4144 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
    8059             :                                                 const SDLoc &DL,
    8060             :                                                 SDValue Ptr) const {
    8061        4144 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    8062             : 
    8063             :   // Build the half of the subregister with the constants before building the
    8064             :   // full 128-bit register. If we are building multiple resource descriptors,
    8065             :   // this will allow CSEing of the 2-component register.
    8066             :   const SDValue Ops0[] = {
    8067             :     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    8068             :     buildSMovImm32(DAG, DL, 0),
    8069             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    8070        4144 :     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    8071             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    8072       16576 :   };
    8073             : 
    8074        4144 :   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
    8075             :                                                 MVT::v2i32, Ops0), 0);
    8076             : 
    8077             :   // Combine the constants and the pointer.
    8078             :   const SDValue Ops1[] = {
    8079             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    8080             :     Ptr,
    8081             :     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    8082             :     SubRegHi,
    8083             :     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
    8084       12432 :   };
    8085             : 
    8086        4144 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
    8087             : }
    8088             : 
    8089             : /// Return a resource descriptor with the 'Add TID' bit enabled.
    8090             : /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
    8091             : /// of the resource descriptor) to create an offset, which is added to
    8092             : /// the resource pointer.
    8093       17064 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
    8094             :                                            SDValue Ptr, uint32_t RsrcDword1,
    8095             :                                            uint64_t RsrcDword2And3) const {
    8096       17064 :   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
    8097       17064 :   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
    8098       17064 :   if (RsrcDword1) {
    8099           0 :     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
    8100             :                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
    8101             :                     0);
    8102             :   }
    8103             : 
    8104             :   SDValue DataLo = buildSMovImm32(DAG, DL,
    8105       17064 :                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
    8106       17064 :   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
    8107             : 
    8108             :   const SDValue Ops[] = {
    8109             :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    8110             :     PtrLo,
    8111             :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    8112             :     PtrHi,
    8113             :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    8114             :     DataLo,
    8115             :     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    8116             :     DataHi,
    8117             :     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
    8118       85320 :   };
    8119             : 
    8120       17064 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
    8121             : }
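A minimal sketch of the dword packing buildRSRC performs for words 2 and 3 of
the 128-bit descriptor (plain C++, not LLVM code):

  #include <cstdint>
  #include <utility>

  static std::pair<uint32_t, uint32_t> splitRsrcDwords(uint64_t RsrcDword2And3) {
    // The low half becomes descriptor word 2 and the high half word 3; each is
    // materialized above by its own S_MOV_B32 via buildSMovImm32.
    uint32_t DataLo = static_cast<uint32_t>(RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
    uint32_t DataHi = static_cast<uint32_t>(RsrcDword2And3 >> 32);
    return {DataLo, DataHi};
  }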
    8122             : 
    8123             : //===----------------------------------------------------------------------===//
    8124             : //                         SI Inline Assembly Support
    8125             : //===----------------------------------------------------------------------===//
    8126             : 
    8127             : std::pair<unsigned, const TargetRegisterClass *>
    8128        2107 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    8129             :                                                StringRef Constraint,
    8130             :                                                MVT VT) const {
    8131             :   const TargetRegisterClass *RC = nullptr;
    8132        2107 :   if (Constraint.size() == 1) {
    8133        1282 :     switch (Constraint[0]) {
    8134           0 :     default:
    8135           0 :       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    8136         358 :     case 's':
    8137             :     case 'r':
    8138         358 :       switch (VT.getSizeInBits()) {
    8139           6 :       default:
    8140           6 :         return std::make_pair(0U, nullptr);
    8141             :       case 32:
    8142             :       case 16:
    8143             :         RC = &AMDGPU::SReg_32_XM0RegClass;
    8144             :         break;
    8145          73 :       case 64:
    8146             :         RC = &AMDGPU::SGPR_64RegClass;
    8147          73 :         break;
    8148          18 :       case 128:
    8149             :         RC = &AMDGPU::SReg_128RegClass;
    8150          18 :         break;
    8151          48 :       case 256:
    8152             :         RC = &AMDGPU::SReg_256RegClass;
    8153          48 :         break;
    8154          32 :       case 512:
    8155             :         RC = &AMDGPU::SReg_512RegClass;
    8156          32 :         break;
    8157             :       }
    8158             :       break;
    8159         283 :     case 'v':
    8160         283 :       switch (VT.getSizeInBits()) {
    8161           6 :       default:
    8162           6 :         return std::make_pair(0U, nullptr);
    8163             :       case 32:
    8164             :       case 16:
    8165             :         RC = &AMDGPU::VGPR_32RegClass;
    8166             :         break;
    8167          42 :       case 64:
    8168             :         RC = &AMDGPU::VReg_64RegClass;
    8169          42 :         break;
    8170           0 :       case 96:
    8171             :         RC = &AMDGPU::VReg_96RegClass;
    8172           0 :         break;
    8173          23 :       case 128:
    8174             :         RC = &AMDGPU::VReg_128RegClass;
    8175          23 :         break;
    8176           0 :       case 256:
    8177             :         RC = &AMDGPU::VReg_256RegClass;
    8178           0 :         break;
    8179           0 :       case 512:
    8180             :         RC = &AMDGPU::VReg_512RegClass;
    8181           0 :         break;
    8182             :       }
    8183             :       break;
    8184             :     }
    8185             :     // We actually support i128, i16 and f16 as inline parameters
    8186             :     // even if they are not reported as legal
    8187          38 :     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
    8188          26 :                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    8189             :       return std::make_pair(0U, RC);
    8190             :   }
    8191             : 
    8192        1484 :   if (Constraint.size() > 1) {
    8193        2932 :     if (Constraint[1] == 'v') {
    8194             :       RC = &AMDGPU::VGPR_32RegClass;
    8195         722 :     } else if (Constraint[1] == 's') {
    8196             :       RC = &AMDGPU::SGPR_32RegClass;
    8197             :     }
    8198             : 
    8199        1466 :     if (RC) {
    8200             :       uint32_t Idx;
    8201        2600 :       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
    8202        1300 :       if (!Failed && Idx < RC->getNumRegs())
    8203             :         return std::make_pair(RC->getRegister(Idx), RC);
    8204             :     }
    8205             :   }
    8206        1484 :   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    8207             : }
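From the user's side, the single-character 'v' and 's' constraints handled
above (and classified as C_RegisterClass by getConstraintType below) request a
VGPR or SGPR for GPU inline assembly. A usage sketch, assuming Clang-compiled
AMDGPU device code; the v_mov_b32 is only illustrative:

  static inline float copy_through_vgpr(float x) {
    float r;
    // "=v" asks for a VGPR output operand, "v" for a VGPR input operand.
    __asm__ volatile("v_mov_b32 %0, %1" : "=v"(r) : "v"(x));
    return r;
  }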
    8208             : 
    8209             : SITargetLowering::ConstraintType
    8210        7147 : SITargetLowering::getConstraintType(StringRef Constraint) const {
    8211        7147 :   if (Constraint.size() == 1) {
    8212        5390 :     switch (Constraint[0]) {
    8213             :     default: break;
    8214             :     case 's':
    8215             :     case 'v':
    8216             :       return C_RegisterClass;
    8217             :     }
    8218             :   }
    8219        4636 :   return TargetLowering::getConstraintType(Constraint);
    8220             : }
    8221             : 
    8222             : // Figure out which registers should be reserved for stack access. Only after
    8223             : // the function is legalized do we know all of the non-spill stack objects or if
    8224             : // calls are present.
    8225       17887 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
    8226       17887 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    8227       17887 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    8228       17887 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
    8229       17887 :   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    8230             : 
    8231       17887 :   if (Info->isEntryFunction()) {
    8232             :     // Callable functions have fixed registers used for stack access.
    8233       16470 :     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
    8234             :   }
    8235             : 
    8236             :   // We have to assume the SP is needed in case there are calls in the function
    8237             :   // during lowering. Calls are only detected after the function is
    8238             :   // lowered. We're about to reserve registers, so don't bother setting up the
    8239             :   // SP unless we are really going to need it.
    8240       34357 :   bool NeedSP = !Info->isEntryFunction() ||
    8241       34355 :     MFI.hasVarSizedObjects() ||
    8242       16468 :     MFI.hasCalls();
    8243             : 
    8244             :   if (NeedSP) {
    8245        1764 :     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    8246             :     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
    8247             : 
    8248             :     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    8249             :     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
    8250             :                                Info->getStackPtrOffsetReg()));
    8251        1764 :     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
    8252             :   }
    8253             : 
    8254       17887 :   MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
    8255       17887 :   MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
    8256       17887 :   MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
    8257             :                      Info->getScratchWaveOffsetReg());
    8258             : 
    8259       17887 :   Info->limitOccupancy(MF);
    8260             : 
    8261       17887 :   TargetLoweringBase::finalizeLowering(MF);
    8262       17887 : }
    8263             : 
    8264      444055 : void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
    8265             :                                                      KnownBits &Known,
    8266             :                                                      const APInt &DemandedElts,
    8267             :                                                      const SelectionDAG &DAG,
    8268             :                                                      unsigned Depth) const {
    8269      444055 :   TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
    8270             :                                                 DAG, Depth);
    8271             : 
    8272      444055 :   if (getSubtarget()->enableHugePrivateBuffer())
    8273             :     return;
    8274             : 
    8275             :   // Technically it may be possible to have a dispatch with a single workitem
    8276             :   // that uses the full private memory size, but that's not really useful. We
    8277             :   // can't use vaddr in MUBUF instructions if we don't know the address
    8278             :   // calculation won't overflow, so assume the sign bit is never set.
    8279      444047 :   Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
    8280             : }
    8281             : 
    8282     4704527 : bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
    8283             :   FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
    8284             : {
    8285     9409054 :   switch (N->getOpcode()) {
    8286      420137 :     case ISD::Register:
    8287             :     case ISD::CopyFromReg:
    8288             :     {
    8289             :       const RegisterSDNode *R = nullptr;
    8290      420137 :       if (N->getOpcode() == ISD::Register) {
    8291             :         R = dyn_cast<RegisterSDNode>(N);
    8292             :       }
    8293             :       else {
    8294      175299 :         R = dyn_cast<RegisterSDNode>(N->getOperand(1));
    8295             :       }
    8296      420137 :       if (R)
    8297             :       {
    8298      420137 :         const MachineFunction * MF = FLI->MF;
    8299      420137 :         const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
    8300      420137 :         const MachineRegisterInfo &MRI = MF->getRegInfo();
    8301             :         const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
    8302      420137 :         unsigned Reg = R->getReg();
    8303      420137 :         if (TRI.isPhysicalRegister(Reg))
    8304       53597 :           return TRI.isVGPR(MRI, Reg);
    8305             : 
    8306      366540 :         if (MRI.isLiveIn(Reg)) {
    8307             :           // workitem.id.x workitem.id.y workitem.id.z
    8308             :           // Any VGPR formal argument is also considered divergent
    8309      278022 :           if (TRI.isVGPR(MRI, Reg))
    8310             :               return true;
    8311             :           // Formal arguments of non-entry functions
    8312             :           // are conservatively considered divergent
    8313      394468 :           else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
    8314             :             return true;
    8315             :         }
    8316      270182 :         return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
    8317           0 :       }
    8318             :     }
    8319             :     break;
    8320             :     case ISD::LOAD: {
    8321             :       const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
    8322      598766 :       if (L->getMemOperand()->getAddrSpace() ==
    8323      299383 :           Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
    8324      288686 :         return true;
    8325             :     } break;
    8326             :     case ISD::CALLSEQ_END:
    8327             :     return true;
    8328             :     break;
    8329       64915 :     case ISD::INTRINSIC_WO_CHAIN:
    8330             :     {
    8331             : 
    8332             :     }
    8333       64915 :       return AMDGPU::isIntrinsicSourceOfDivergence(
    8334      194745 :       cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
    8335        4771 :     case ISD::INTRINSIC_W_CHAIN:
    8336        4771 :       return AMDGPU::isIntrinsicSourceOfDivergence(
    8337       14313 :       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
    8338             :     // In some cases intrinsics that are a source of divergence have been
    8339             :     // lowered to AMDGPUISD, so we need to check those as well.
    8340             :     case AMDGPUISD::INTERP_MOV:
    8341             :     case AMDGPUISD::INTERP_P1:
    8342             :     case AMDGPUISD::INTERP_P2:
    8343             :       return true;
    8344             :   }
    8345             :   return false;
    8346      299229 : }
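A rough source-level picture of the distinction this function encodes (a
sketch, assuming HIP/CUDA-style device code; names are illustrative): values
derived from the workitem id live in VGPRs and are divergent, while scalar
kernel arguments stay uniform.

  __global__ void scale(float *out, const float *in, float k) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;  // divergent: per-lane value
    out[i] = in[i] * k;                             // k is uniform across the wave
  }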

Generated by: LCOV version 1.13