LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIISelLowering.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2017-09-14 15:23:50

             Hit    Total   Coverage
Lines:       3214   3400    94.5 %
Functions:   136    137     99.3 %

          Line data    Source code
       1             : //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Custom DAG lowering for SI
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #ifdef _MSC_VER
      16             : // Provide M_PI.
      17             : #define _USE_MATH_DEFINES
      18             : #endif
      19             : 
      20             : #include "SIISelLowering.h"
      21             : #include "AMDGPU.h"
      22             : #include "AMDGPUIntrinsicInfo.h"
      23             : #include "AMDGPUSubtarget.h"
      24             : #include "AMDGPUTargetMachine.h"
      25             : #include "SIDefines.h"
      26             : #include "SIInstrInfo.h"
      27             : #include "SIMachineFunctionInfo.h"
      28             : #include "SIRegisterInfo.h"
      29             : #include "Utils/AMDGPUBaseInfo.h"
      30             : #include "llvm/ADT/APFloat.h"
      31             : #include "llvm/ADT/APInt.h"
      32             : #include "llvm/ADT/ArrayRef.h"
      33             : #include "llvm/ADT/BitVector.h"
      34             : #include "llvm/ADT/SmallVector.h"
      35             : #include "llvm/ADT/Statistic.h"
      36             : #include "llvm/ADT/StringRef.h"
      37             : #include "llvm/ADT/StringSwitch.h"
      38             : #include "llvm/ADT/Twine.h"
      39             : #include "llvm/CodeGen/Analysis.h"
      40             : #include "llvm/CodeGen/CallingConvLower.h"
      41             : #include "llvm/CodeGen/DAGCombine.h"
      42             : #include "llvm/CodeGen/ISDOpcodes.h"
      43             : #include "llvm/CodeGen/MachineBasicBlock.h"
      44             : #include "llvm/CodeGen/MachineFrameInfo.h"
      45             : #include "llvm/CodeGen/MachineFunction.h"
      46             : #include "llvm/CodeGen/MachineInstr.h"
      47             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      48             : #include "llvm/CodeGen/MachineMemOperand.h"
      49             : #include "llvm/CodeGen/MachineModuleInfo.h"
      50             : #include "llvm/CodeGen/MachineOperand.h"
      51             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      52             : #include "llvm/CodeGen/MachineValueType.h"
      53             : #include "llvm/CodeGen/SelectionDAG.h"
      54             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      55             : #include "llvm/CodeGen/ValueTypes.h"
      56             : #include "llvm/IR/Constants.h"
      57             : #include "llvm/IR/DataLayout.h"
      58             : #include "llvm/IR/DebugLoc.h"
      59             : #include "llvm/IR/DerivedTypes.h"
      60             : #include "llvm/IR/DiagnosticInfo.h"
      61             : #include "llvm/IR/Function.h"
      62             : #include "llvm/IR/GlobalValue.h"
      63             : #include "llvm/IR/InstrTypes.h"
      64             : #include "llvm/IR/Instruction.h"
      65             : #include "llvm/IR/Instructions.h"
      66             : #include "llvm/IR/IntrinsicInst.h"
      67             : #include "llvm/IR/Type.h"
      68             : #include "llvm/Support/Casting.h"
      69             : #include "llvm/Support/CodeGen.h"
      70             : #include "llvm/Support/CommandLine.h"
      71             : #include "llvm/Support/Compiler.h"
      72             : #include "llvm/Support/ErrorHandling.h"
      73             : #include "llvm/Support/KnownBits.h"
      74             : #include "llvm/Support/MathExtras.h"
      75             : #include "llvm/Target/TargetCallingConv.h"
      76             : #include "llvm/Target/TargetOptions.h"
      77             : #include "llvm/Target/TargetRegisterInfo.h"
      78             : #include <cassert>
      79             : #include <cmath>
      80             : #include <cstdint>
      81             : #include <iterator>
      82             : #include <tuple>
      83             : #include <utility>
      84             : #include <vector>
      85             : 
      86             : using namespace llvm;
      87             : 
      88             : #define DEBUG_TYPE "si-lower"
      89             : 
      90             : STATISTIC(NumTailCalls, "Number of tail calls");
      91             : 
      92       72306 : static cl::opt<bool> EnableVGPRIndexMode(
      93             :   "amdgpu-vgpr-index-mode",
      94      216918 :   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
      95      289224 :   cl::init(false));
      96             : 
      97             : static unsigned findFirstFreeSGPR(CCState &CCInfo) {
      98          88 :   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
      99         201 :   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
     100         402 :     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
     101             :       return AMDGPU::SGPR0 + Reg;
     102             :     }
     103             :   }
     104           0 :   llvm_unreachable("Cannot allocate sgpr");
     105             : }
     106             : 
     107        1796 : SITargetLowering::SITargetLowering(const TargetMachine &TM,
     108        1796 :                                    const SISubtarget &STI)
     109        1796 :     : AMDGPUTargetLowering(TM, STI) {
     110        3592 :   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
     111        3592 :   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
     112             : 
     113        3592 :   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
     114        3592 :   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
     115             : 
     116        3592 :   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
     117        3592 :   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
     118        3592 :   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
     119             : 
     120        3592 :   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
     121        3592 :   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
     122             : 
     123        3592 :   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
     124        3592 :   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
     125             : 
     126        3592 :   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
     127        3592 :   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
     128             : 
     129        3592 :   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
     130        3592 :   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
     131             : 
     132        1796 :   if (Subtarget->has16BitInsts()) {
     133        1544 :     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     134        1544 :     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
     135             :   }
     136             : 
     137        1796 :   if (Subtarget->hasVOP3PInsts()) {
     138         254 :     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     139         254 :     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
     140             :   }
     141             : 
     142        3592 :   computeRegisterProperties(STI.getRegisterInfo());
     143             : 
     144             :   // We need to custom lower vector stores from local memory
     145        3592 :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     146        3592 :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
     147        3592 :   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
     148        3592 :   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
     149        3592 :   setOperationAction(ISD::LOAD, MVT::i1, Custom);
     150             : 
     151        3592 :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     152        3592 :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
     153        3592 :   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
     154        3592 :   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
     155        3592 :   setOperationAction(ISD::STORE, MVT::i1, Custom);
     156             : 
     157        3592 :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     158        3592 :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
     159        3592 :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
     160        3592 :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
     161        3592 :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
     162        3592 :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
     163        3592 :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
     164        3592 :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
     165        3592 :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
     166        3592 :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
     167             : 
     168        3592 :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     169        3592 :   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
     170        3592 :   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
     171             : 
     172        3592 :   setOperationAction(ISD::SELECT, MVT::i1, Promote);
     173        3592 :   setOperationAction(ISD::SELECT, MVT::i64, Custom);
     174        3592 :   setOperationAction(ISD::SELECT, MVT::f64, Promote);
     175        3592 :   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
     176             : 
     177        3592 :   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     178        3592 :   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
     179        3592 :   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
     180        3592 :   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
     181        3592 :   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
     182             : 
     183        3592 :   setOperationAction(ISD::SETCC, MVT::i1, Promote);
     184        3592 :   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
     185        3592 :   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
     186        3592 :   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
     187             : 
     188        3592 :   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
     189        3592 :   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
     190             : 
     191        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
     192        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
     193        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
     194        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
     195        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
     196        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
     197        3592 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
     198             : 
     199        3592 :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     200        3592 :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
     201        3592 :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
     202        3592 :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
     203             : 
     204        3592 :   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
     205             : 
     206        3592 :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     207        3592 :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
     208        3592 :   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
     209             : 
     210        3592 :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     211        3592 :   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
     212        3592 :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     213        3592 :   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
     214        3592 :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     215        3592 :   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     216             : 
     217        3592 :   setOperationAction(ISD::UADDO, MVT::i32, Legal);
     218        3592 :   setOperationAction(ISD::USUBO, MVT::i32, Legal);
     219             : 
     220        3592 :   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
     221        3592 :   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
     222             : 
     223             :   // We only support LOAD/STORE and vector manipulation ops for vectors
     224             :   // with > 4 elements.
     225       10776 :   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
     226       14368 :         MVT::v2i64, MVT::v2f64}) {
     227     2790984 :     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     228     2780208 :       switch (Op) {
     229             :       case ISD::LOAD:
     230             :       case ISD::STORE:
     231             :       case ISD::BUILD_VECTOR:
     232             :       case ISD::BITCAST:
     233             :       case ISD::EXTRACT_VECTOR_ELT:
     234             :       case ISD::INSERT_VECTOR_ELT:
     235             :       case ISD::INSERT_SUBVECTOR:
     236             :       case ISD::EXTRACT_SUBVECTOR:
     237             :       case ISD::SCALAR_TO_VECTOR:
     238             :         break;
     239       10776 :       case ISD::CONCAT_VECTORS:
     240       10776 :         setOperationAction(Op, VT, Custom);
     241             :         break;
     242     2672448 :       default:
     243     2672448 :         setOperationAction(Op, VT, Expand);
     244             :         break;
     245             :       }
     246             :     }
     247             :   }
     248             : 
     249             :   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
     250             :   // is expanded to avoid having two separate loops in case the index is a VGPR.
     251             : 
     252             :   // Most operations are naturally 32-bit vector operations. We only support
     253             :   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
     254       12572 :   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
     255        7184 :     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
     256        7184 :     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
     257             : 
     258        7184 :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     259        7184 :     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
     260             : 
     261        7184 :     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
     262        7184 :     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
     263             : 
     264        7184 :     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
     265        7184 :     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
     266             :   }
     267             : 
     268        3592 :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
     269        3592 :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
     270        3592 :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
     271        3592 :   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
     272             : 
     273             :   // Avoid stack access for these.
     274             :   // TODO: Generalize to more vector types.
     275        3592 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
     276        3592 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
     277        3592 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     278        3592 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     279             : 
     280             :   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
     281             :   // and output demarshalling
     282        3592 :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
     283        3592 :   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
     284             : 
     285             :   // We can't return success/failure, only the old value;
     286             :   // let LLVM add the comparison.
     287        3592 :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
     288        3592 :   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
     289             : 
     290        1796 :   if (getSubtarget()->hasFlatAddressSpace()) {
     291        2156 :     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
     292        2156 :     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
     293             :   }
     294             : 
     295        3592 :   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
     296        3592 :   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
     297             : 
     298             :   // This is s_memtime on SI and s_memrealtime on VI.
     299        3592 :   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
     300        3592 :   setOperationAction(ISD::TRAP, MVT::Other, Custom);
     301        3592 :   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
     302             : 
     303        3592 :   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
     304        3592 :   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     305             : 
     306        1796 :   if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
     307        1926 :     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     308        1926 :     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     309        1926 :     setOperationAction(ISD::FRINT, MVT::f64, Legal);
     310             :   }
     311             : 
     312        3592 :   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     313             : 
     314        3592 :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     315        3592 :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     316        3592 :   setOperationAction(ISD::FDIV, MVT::f32, Custom);
     317        3592 :   setOperationAction(ISD::FDIV, MVT::f64, Custom);
     318             : 
     319        1796 :   if (Subtarget->has16BitInsts()) {
     320        1544 :     setOperationAction(ISD::Constant, MVT::i16, Legal);
     321             : 
     322        1544 :     setOperationAction(ISD::SMIN, MVT::i16, Legal);
     323        1544 :     setOperationAction(ISD::SMAX, MVT::i16, Legal);
     324             : 
     325        1544 :     setOperationAction(ISD::UMIN, MVT::i16, Legal);
     326        1544 :     setOperationAction(ISD::UMAX, MVT::i16, Legal);
     327             : 
     328        1544 :     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
     329        1544 :     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
     330             : 
     331        1544 :     setOperationAction(ISD::ROTR, MVT::i16, Promote);
     332        1544 :     setOperationAction(ISD::ROTL, MVT::i16, Promote);
     333             : 
     334        1544 :     setOperationAction(ISD::SDIV, MVT::i16, Promote);
     335        1544 :     setOperationAction(ISD::UDIV, MVT::i16, Promote);
     336        1544 :     setOperationAction(ISD::SREM, MVT::i16, Promote);
     337        1544 :     setOperationAction(ISD::UREM, MVT::i16, Promote);
     338             : 
     339        1544 :     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
     340        1544 :     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
     341             : 
     342        1544 :     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
     343        1544 :     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
     344        1544 :     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
     345        1544 :     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
     346             : 
     347        1544 :     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
     348             : 
     349        1544 :     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
     350             : 
     351        1544 :     setOperationAction(ISD::LOAD, MVT::i16, Custom);
     352             : 
     353        1544 :     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
     354             : 
     355        1544 :     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
     356        1544 :     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
     357        1544 :     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     358        1544 :     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
     359             : 
     360        1544 :     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
     361        1544 :     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     362        1544 :     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
     363        1544 :     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     364             : 
     365             :     // F16 - Constant Actions.
     366        1544 :     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
     367             : 
     368             :     // F16 - Load/Store Actions.
     369        1544 :     setOperationAction(ISD::LOAD, MVT::f16, Promote);
     370        1544 :     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
     371        1544 :     setOperationAction(ISD::STORE, MVT::f16, Promote);
     372        1544 :     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
     373             : 
     374             :     // F16 - VOP1 Actions.
     375        1544 :     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     376        1544 :     setOperationAction(ISD::FCOS, MVT::f16, Promote);
     377        1544 :     setOperationAction(ISD::FSIN, MVT::f16, Promote);
     378        1544 :     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
     379        1544 :     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     380        1544 :     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
     381        1544 :     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
     382        1544 :     setOperationAction(ISD::FROUND, MVT::f16, Custom);
     383             : 
     384             :     // F16 - VOP2 Actions.
     385        1544 :     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     386        1544 :     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
     387        1544 :     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     388        1544 :     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     389        1544 :     setOperationAction(ISD::FDIV, MVT::f16, Custom);
     390             : 
     391             :     // F16 - VOP3 Actions.
     392        1544 :     setOperationAction(ISD::FMA, MVT::f16, Legal);
     393         772 :     if (!Subtarget->hasFP16Denormals())
     394          52 :       setOperationAction(ISD::FMAD, MVT::f16, Legal);
     395             :   }
     396             : 
     397        1796 :   if (Subtarget->hasVOP3PInsts()) {
     398         889 :     for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
     399       65786 :       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
     400       65532 :         switch (Op) {
     401             :         case ISD::LOAD:
     402             :         case ISD::STORE:
     403             :         case ISD::BUILD_VECTOR:
     404             :         case ISD::BITCAST:
     405             :         case ISD::EXTRACT_VECTOR_ELT:
     406             :         case ISD::INSERT_VECTOR_ELT:
     407             :         case ISD::INSERT_SUBVECTOR:
     408             :         case ISD::EXTRACT_SUBVECTOR:
     409             :         case ISD::SCALAR_TO_VECTOR:
     410             :           break;
     411         254 :         case ISD::CONCAT_VECTORS:
     412         254 :           setOperationAction(Op, VT, Custom);
     413             :           break;
     414       62992 :         default:
     415       62992 :           setOperationAction(Op, VT, Expand);
     416             :           break;
     417             :         }
     418             :       }
     419             :     }
     420             : 
     421             :     // XXX - Do these do anything? Vector constants turn into build_vector.
     422         254 :     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
     423         254 :     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
     424             : 
     425         254 :     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
     426         254 :     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
     427         254 :     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
     428         254 :     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
     429             : 
     430         254 :     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
     431         254 :     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
     432         254 :     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
     433         254 :     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
     434             : 
     435         254 :     setOperationAction(ISD::AND, MVT::v2i16, Promote);
     436         254 :     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
     437         254 :     setOperationAction(ISD::OR, MVT::v2i16, Promote);
     438         254 :     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     439         254 :     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     440         254 :     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
     441         254 :     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
     442         254 :     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
     443         254 :     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
     444         254 :     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
     445             : 
     446         254 :     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     447         254 :     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     448         254 :     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
     449         254 :     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
     450         254 :     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
     451         254 :     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
     452         254 :     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
     453         254 :     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
     454         254 :     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
     455         254 :     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
     456             : 
     457         254 :     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     458         254 :     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
     459         254 :     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     460         254 :     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
     461         254 :     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
     462         254 :     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
     463             : 
     464             :     // This isn't really legal, but this avoids the legalizer unrolling it (and
     465             :     // allows matching fneg (fabs x) patterns)
     466         254 :     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
     467             : 
     468         254 :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     469         254 :     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
     470             : 
     471         254 :     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
     472         254 :     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
     473         254 :     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
     474             :   } else {
     475        3338 :     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     476        3338 :     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
     477             :   }
     478             : 
     479       28736 :   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
     480       17960 :     setOperationAction(ISD::SELECT, VT, Custom);
     481             :   }
     482             : 
     483        3592 :   setTargetDAGCombine(ISD::ADD);
     484        3592 :   setTargetDAGCombine(ISD::ADDCARRY);
     485        3592 :   setTargetDAGCombine(ISD::SUB);
     486        3592 :   setTargetDAGCombine(ISD::SUBCARRY);
     487        3592 :   setTargetDAGCombine(ISD::FADD);
     488        3592 :   setTargetDAGCombine(ISD::FSUB);
     489        3592 :   setTargetDAGCombine(ISD::FMINNUM);
     490        3592 :   setTargetDAGCombine(ISD::FMAXNUM);
     491        3592 :   setTargetDAGCombine(ISD::SMIN);
     492        3592 :   setTargetDAGCombine(ISD::SMAX);
     493        3592 :   setTargetDAGCombine(ISD::UMIN);
     494        3592 :   setTargetDAGCombine(ISD::UMAX);
     495        3592 :   setTargetDAGCombine(ISD::SETCC);
     496        3592 :   setTargetDAGCombine(ISD::AND);
     497        3592 :   setTargetDAGCombine(ISD::OR);
     498        3592 :   setTargetDAGCombine(ISD::XOR);
     499        3592 :   setTargetDAGCombine(ISD::SINT_TO_FP);
     500        3592 :   setTargetDAGCombine(ISD::UINT_TO_FP);
     501        3592 :   setTargetDAGCombine(ISD::FCANONICALIZE);
     502        3592 :   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
     503        3592 :   setTargetDAGCombine(ISD::ZERO_EXTEND);
     504        3592 :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     505             : 
     506             :   // All memory operations. Some folding on the pointer operand is done to help
     507             :   // match the constant offsets in the addressing modes.
     508        3592 :   setTargetDAGCombine(ISD::LOAD);
     509        3592 :   setTargetDAGCombine(ISD::STORE);
     510        3592 :   setTargetDAGCombine(ISD::ATOMIC_LOAD);
     511        3592 :   setTargetDAGCombine(ISD::ATOMIC_STORE);
     512        3592 :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
     513        3592 :   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
     514        3592 :   setTargetDAGCombine(ISD::ATOMIC_SWAP);
     515        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
     516        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
     517        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
     518        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
     519        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
     520        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
     521        1796 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
     522        3592 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
     523        3592 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
     524        3592 :   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
     525             : 
     526        3592 :   setSchedulingPreference(Sched::RegPressure);
     527        1796 : }
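
[Editorial note, not part of SIISelLowering.cpp] The constructor above is a long
sequence of setOperationAction() / AddPromotedToType() calls. As a rough guide to
the four legalize actions it uses: Legal means the node is selected as-is, Custom
routes it through the target's LowerOperation() hook, Expand lets the generic
legalizer rewrite it in terms of other operations, and Promote re-issues it in the
type registered with AddPromotedToType. A minimal sketch of the Promote idiom used
for f16 loads above (the calls are taken from the code; the comment is editorial):

    // "load f16" is legalized to "load i16" plus a bitcast back to f16.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);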
     528             : 
     529      443020 : const SISubtarget *SITargetLowering::getSubtarget() const {
     530      443020 :   return static_cast<const SISubtarget *>(Subtarget);
     531             : }
     532             : 
     533             : //===----------------------------------------------------------------------===//
     534             : // TargetLowering queries
     535             : //===----------------------------------------------------------------------===//
     536             : 
     537          18 : bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
     538             :   // SI has some legal vector types, but no legal vector operations. Say no
     539             :   // shuffles are legal in order to prefer scalarizing some vector operations.
     540          18 :   return false;
     541             : }
     542             : 
     543        7647 : bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     544             :                                           const CallInst &CI,
     545             :                                           unsigned IntrID) const {
     546        7647 :   switch (IntrID) {
     547         150 :   case Intrinsic::amdgcn_atomic_inc:
     548             :   case Intrinsic::amdgcn_atomic_dec: {
     549         150 :     Info.opc = ISD::INTRINSIC_W_CHAIN;
     550         150 :     Info.memVT = MVT::getVT(CI.getType());
     551         150 :     Info.ptrVal = CI.getOperand(0);
     552         150 :     Info.align = 0;
     553             : 
     554         298 :     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     555         148 :     Info.vol = !Vol || !Vol->isZero();
     556         150 :     Info.readMem = true;
     557         150 :     Info.writeMem = true;
     558         150 :     return true;
     559             :   }
     560             :   default:
     561             :     return false;
     562             :   }
     563             : }
     564             : 
     565       11373 : bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
     566             :                                             SmallVectorImpl<Value*> &Ops,
     567             :                                             Type *&AccessTy) const {
     568       11373 :   switch (II->getIntrinsicID()) {
     569         174 :   case Intrinsic::amdgcn_atomic_inc:
     570             :   case Intrinsic::amdgcn_atomic_dec: {
     571         348 :     Value *Ptr = II->getArgOperand(0);
     572         174 :     AccessTy = II->getType();
     573         174 :     Ops.push_back(Ptr);
     574             :     return true;
     575             :   }
     576             :   default:
     577             :     return false;
     578             :   }
     579             : }
     580             : 
     581       29671 : bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
     582       29671 :   if (!Subtarget->hasFlatInstOffsets()) {
     583             :     // Flat instructions do not have offsets, and only have the register
     584             :     // address.
     585       28845 :     return AM.BaseOffs == 0 && AM.Scale == 0;
     586             :   }
     587             : 
     588             :   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
     589             :   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
     590             : 
     591             :   // Just r + i
     592         826 :   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
     593             : }
     594             : 
     595       63466 : bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
     596       63466 :   if (Subtarget->hasFlatGlobalInsts())
     597       18068 :     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
     598             : 
     599      108864 :   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
     600             :       // Assume that we will use FLAT for all global memory accesses
     601             :       // on VI.
     602             :       // FIXME: This assumption is currently wrong.  On VI we still use
     603             :       // MUBUF instructions for the r + i addressing mode.  As currently
     604             :       // implemented, the MUBUF instructions only work on buffer < 4GB.
     605             :       // It may be possible to support > 4GB buffers with MUBUF instructions,
     606             :       // by setting the stride value in the resource descriptor which would
     607             :       // increase the size limit to (stride * 4GB).  However, this is risky,
     608             :       // because it has never been validated.
     609       24052 :     return isLegalFlatAddressingMode(AM);
     610             :   }
     611             : 
     612       30380 :   return isLegalMUBUFAddressingMode(AM);
     613             : }
     614             : 
     615       34841 : bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
     616             :   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
     617             :   // additionally can do r + r + i with addr64. 32-bit has more addressing
     618             :   // mode options. Depending on the resource constant, it can also do
     619             :   // (i64 r0) + (i32 r1) * (i14 i).
     620             :   //
     621             :   // Private arrays end up using a scratch buffer most of the time, so also
     622             :   // assume those use MUBUF instructions. Scratch loads / stores are currently
     623             :   // implemented as mubuf instructions with offen bit set, so slightly
     624             :   // different than the normal addr64.
     625       34841 :   if (!isUInt<12>(AM.BaseOffs))
     626             :     return false;
     627             : 
     628             :   // FIXME: Since we can split immediate into soffset and immediate offset,
     629             :   // would it make sense to allow any immediate?
     630             : 
     631       34456 :   switch (AM.Scale) {
     632             :   case 0: // r + i or just i, depending on HasBaseReg.
     633             :     return true;
     634             :   case 1:
     635             :     return true; // We have r + r or r + i.
     636         778 :   case 2:
     637         778 :     if (AM.HasBaseReg) {
     638             :       // Reject 2 * r + r.
     639             :       return false;
     640             :     }
     641             : 
     642             :     // Allow 2 * r as r + r
     643             :     // Or  2 * r + i is allowed as r + r + i.
     644           0 :     return true;
     645       12571 :   default: // Don't allow n * r
     646       12571 :     return false;
     647             :   }
     648             : }
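
[Editorial sketch, not part of SIISelLowering.cpp] The MUBUF rules above reduce to
a small predicate over the immediate offset and the scale. The helper below restates
them under a hypothetical name purely for illustration; isUInt comes from
llvm/Support/MathExtras.h, which this file already includes.

    #include "llvm/Support/MathExtras.h"

    // Mirrors the offset/scale part of isLegalMUBUFAddressingMode.
    static bool sketchMUBUFImmAndScaleOK(int64_t BaseOffs, int64_t Scale,
                                         bool HasBaseReg) {
      if (!llvm::isUInt<12>(BaseOffs))  // 12-bit unsigned byte offset: 0..4095.
        return false;
      if (Scale == 0 || Scale == 1)     // i, r + i, r + r, or r + r + i.
        return true;
      if (Scale == 2)                   // 2 * r folds to r + r,
        return !HasBaseReg;             // but 2 * r + r is rejected.
      return false;                     // No general n * r addressing.
    }

    // e.g. sketchMUBUFImmAndScaleOK(4095, 1, true)  -> true
    //      sketchMUBUFImmAndScaleOK(4096, 0, false) -> false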
     649             : 
     650       89884 : bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     651             :                                              const AddrMode &AM, Type *Ty,
     652             :                                              unsigned AS, Instruction *I) const {
     653             :   // No global is ever allowed as a base.
     654       89884 :   if (AM.BaseGV)
     655             :     return false;
     656             : 
     657       88202 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
     658       63327 :     return isLegalGlobalAddressingMode(AM);
     659             : 
     660       24875 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
     661             :     // If the offset isn't a multiple of 4, it probably isn't going to be
     662             :     // correctly aligned.
     663             :     // FIXME: Can we get the real alignment here?
     664        2732 :     if (AM.BaseOffs % 4 != 0)
     665          40 :       return isLegalMUBUFAddressingMode(AM);
     666             : 
     667             :     // There are no SMRD extloads, so if we have to do a small type access we
     668             :     // will use a MUBUF load.
     669             :     // FIXME?: We also need to do this if unaligned, but we don't know the
     670             :     // alignment here.
     671        2692 :     if (DL.getTypeStoreSize(Ty) < 4)
     672         139 :       return isLegalGlobalAddressingMode(AM);
     673             : 
     674        2553 :     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
     675             :       // SMRD instructions have an 8-bit, dword offset on SI.
     676         824 :       if (!isUInt<8>(AM.BaseOffs / 4))
     677             :         return false;
     678        1729 :     } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
     679             :       // On CI+, this can also be a 32-bit literal constant offset. If it fits
     680             :       // in 8-bits, it can use a smaller encoding.
     681         759 :       if (!isUInt<32>(AM.BaseOffs / 4))
     682             :         return false;
     683         970 :     } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     684             :       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
     685         970 :       if (!isUInt<20>(AM.BaseOffs))
     686             :         return false;
     687             :     } else
     688           0 :       llvm_unreachable("unhandled generation");
     689             : 
     690        2447 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     691             :       return true;
     692             : 
     693         257 :     if (AM.Scale == 1 && AM.HasBaseReg)
     694             :       return true;
     695             : 
     696         257 :     return false;
     697             : 
     698       22143 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     699        4421 :     return isLegalMUBUFAddressingMode(AM);
     700       23341 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
     701        5619 :              AS == AMDGPUASI.REGION_ADDRESS) {
     702             :     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     703             :     // field.
     704             :     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
     705             :     // an 8-bit dword offset but we don't know the alignment here.
     706       12103 :     if (!isUInt<16>(AM.BaseOffs))
     707             :       return false;
     708             : 
     709       10740 :     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
     710             :       return true;
     711             : 
     712        2439 :     if (AM.Scale == 1 && AM.HasBaseReg)
     713             :       return true;
     714             : 
     715        1602 :     return false;
     716        5619 :   } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
     717             :              AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
     718             :     // For an unknown address space, this usually means that this is for some
     719             :     // reason being used for pure arithmetic, and not based on some addressing
     720             :     // computation. We don't have instructions that compute pointers with any
     721             :     // addressing modes, so treat them as having no offset like flat
     722             :     // instructions.
     723        5619 :     return isLegalFlatAddressingMode(AM);
     724             :   } else {
     725           0 :     llvm_unreachable("unhandled address space");
     726             :   }
     727             : }
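
[Editorial summary, not part of SIISelLowering.cpp] The immediate-offset limits
checked by isLegalAddressingMode above, per address space (scale handling is
checked separately in each case):

    GLOBAL                  GFX9+: signed 13-bit byte offset (isInt<13>);
                            otherwise the MUBUF (addr64) or flat rules apply.
    CONSTANT (SMRD/SMEM)    SI: unsigned 8-bit dword offset; CI: 32-bit literal
                            dword offset; VI+: unsigned 20-bit byte offset.
                            Offsets that are not a multiple of 4, and sub-dword
                            types, fall back to the MUBUF / global rules.
    PRIVATE (scratch)       MUBUF: unsigned 12-bit byte offset.
    LOCAL / REGION (DS)     Unsigned 16-bit byte offset.
    FLAT / unknown          No offset, unless hasFlatInstOffsets(), in which
                            case an unsigned 12-bit byte offset is accepted.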
     728             : 
     729       12214 : bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
     730             :                                         const SelectionDAG &DAG) const {
     731       12214 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
     732        7044 :     return (MemVT.getSizeInBits() <= 4 * 32);
     733        5170 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     734         977 :     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     735         977 :     return (MemVT.getSizeInBits() <= MaxPrivateBits);
     736        4193 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     737        4193 :     return (MemVT.getSizeInBits() <= 2 * 32);
     738             :   }
     739             :   return true;
     740             : }
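
[Editorial note, not part of SIISelLowering.cpp] In terms of merged store width,
the checks above allow up to 128 bits for global/flat (four dwords), up to
8 * getMaxPrivateElementSize() bits for private (scratch), and up to 64 bits for
local (LDS); other address spaces are not restricted here. Illustrative results:

    canMergeStoresTo(AMDGPUASI.GLOBAL_ADDRESS, MVT::v4i32, DAG) -> true   (128 bits)
    canMergeStoresTo(AMDGPUASI.LOCAL_ADDRESS,  MVT::v4i32, DAG) -> false  (limit is 64 bits)
    canMergeStoresTo(AMDGPUASI.LOCAL_ADDRESS,  MVT::v2i32, DAG) -> true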
     741             : 
     742       23563 : bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     743             :                                                       unsigned AddrSpace,
     744             :                                                       unsigned Align,
     745             :                                                       bool *IsFast) const {
     746       23563 :   if (IsFast)
     747       18086 :     *IsFast = false;
     748             : 
     749             :   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
     750             :   // which isn't a simple VT.
     751             :   // Until MVT is extended to handle this, simply check for the size and
     752             :   // rely on the condition below: allow accesses if the size is a multiple of 4.
     753       70689 :   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
     754           0 :                            VT.getStoreSize() > 16)) {
     755             :     return false;
     756             :   }
     757             : 
     758       40185 :   if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
     759       16622 :       AddrSpace == AMDGPUASI.REGION_ADDRESS) {
     760             :     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
     761             :     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
     762             :     // with adjacent offsets.
     763        6941 :     bool AlignedBy4 = (Align % 4 == 0);
     764        6941 :     if (IsFast)
     765        5048 :       *IsFast = AlignedBy4;
     766             : 
     767             :     return AlignedBy4;
     768             :   }
     769             : 
     770             :   // FIXME: We have to be conservative here and assume that flat operations
     771             :   // will access scratch.  If we had access to the IR function, then we
     772             :   // could determine if any private memory was used in the function.
     773       33212 :   if (!Subtarget->hasUnalignedScratchAccess() &&
     774       32949 :       (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
     775       16359 :        AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
     776             :     return false;
     777             :   }
     778             : 
     779       16367 :   if (Subtarget->hasUnalignedBufferAccess()) {
     780             :     // If we have an uniform constant load, it still requires using a slow
     781             :     // buffer instruction if unaligned.
     782        5402 :     if (IsFast) {
     783        4076 :       *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
     784         165 :         (Align % 4 == 0) : true;
     785             :     }
     786             : 
     787             :     return true;
     788             :   }
     789             : 
     790             :   // Smaller than dword value must be aligned.
     791       10965 :   if (VT.bitsLT(MVT::i32))
     792             :     return false;
     793             : 
     794             :   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
     795             :   // byte-address are ignored, thus forcing Dword alignment.
     796             :   // This applies to private, global, and constant memory.
     797        9219 :   if (IsFast)
     798        7346 :     *IsFast = true;
     799             : 
     800        9219 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
     801             : }
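
[Editorial summary, not part of SIISelLowering.cpp] For a simple dword-or-larger
type, the checks above work out to:

    - LDS (local/region): allowed iff the address is 4-byte aligned; the 8-byte
      case is handled with ds_read2/write2_b32 at adjacent offsets.
    - Flat and scratch: rejected unless the subtarget has unaligned scratch
      access, because flat may address scratch.
    - Subtargets with unaligned buffer access: allowed, but an unaligned uniform
      constant load is reported as slow (*IsFast left false) unless 4-byte aligned.
    - Otherwise: sub-dword types must be naturally aligned, and only accesses
      wider than one dword with 4-byte alignment are reported as allowed.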
     802             : 
     803         112 : EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
     804             :                                           unsigned SrcAlign, bool IsMemset,
     805             :                                           bool ZeroMemset,
     806             :                                           bool MemcpyStrSrc,
     807             :                                           MachineFunction &MF) const {
     808             :   // FIXME: Should account for address space here.
     809             : 
     810             :   // The default fallback uses the private pointer size as a guess for a type to
     811             :   // use. Make sure we switch these to 64-bit accesses.
     812             : 
     813         112 :   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
     814          86 :     return MVT::v4i32;
     815             : 
     816          26 :   if (Size >= 8 && DstAlign >= 4)
     817           8 :     return MVT::v2i32;
     818             : 
     819             :   // Use the default.
     820          18 :   return MVT::Other;
     821             : }
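
[Editorial note, not part of SIISelLowering.cpp] Hypothetical illustrations of
the memcpy/memset type choice above:

    getOptimalMemOpType(/*Size=*/32, /*DstAlign=*/4, ...) -> MVT::v4i32  (dwordx4 accesses)
    getOptimalMemOpType(/*Size=*/8,  /*DstAlign=*/4, ...) -> MVT::v2i32
    getOptimalMemOpType(/*Size=*/6,  /*DstAlign=*/2, ...) -> MVT::Other  (default heuristic)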
     822             : 
     823             : static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
     824         472 :   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
     825         798 :          AS == AMDGPUASI.FLAT_ADDRESS ||
     826             :          AS == AMDGPUASI.CONSTANT_ADDRESS;
     827             : }
     828             : 
     829         219 : bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
     830             :                                            unsigned DestAS) const {
     831         219 :   return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
     832         361 :          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
     833             : }
     834             : 
     835        3046 : bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
     836        3046 :   const MemSDNode *MemNode = cast<MemSDNode>(N);
     837        6092 :   const Value *Ptr = MemNode->getMemOperand()->getValue();
     838        2259 :   const Instruction *I = dyn_cast<Instruction>(Ptr);
     839        4518 :   return I && I->getMetadata("amdgpu.noclobber");
     840             : }
     841             : 
     842          77 : bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
     843             :                                             unsigned DestAS) const {
     844             :   // Flat -> private/local is a simple truncate.
     845             :   // Flat -> global is no-op
     846          77 :   if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
     847             :     return true;
     848             : 
     849          21 :   return isNoopAddrSpaceCast(SrcAS, DestAS);
     850             : }
     851             : 
     852      114941 : bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
     853      114941 :   const MemSDNode *MemNode = cast<MemSDNode>(N);
     854             : 
     855      114941 :   return AMDGPU::isUniformMMO(MemNode->getMemOperand());
     856             : }
     857             : 
     858             : TargetLoweringBase::LegalizeTypeAction
     859      150610 : SITargetLowering::getPreferredVectorAction(EVT VT) const {
     860      274280 :   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
     861             :     return TypeSplitVector;
     862             : 
     863       71840 :   return TargetLoweringBase::getPreferredVectorAction(VT);
     864             : }
     865             : 
     866          32 : bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
     867             :                                                          Type *Ty) const {
     868             :   // FIXME: Could be smarter if called for vector constants.
     869          32 :   return true;
     870             : }
     871             : 
     872      233584 : bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
     873      346301 :   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
     874       11649 :     switch (Op) {
     875             :     case ISD::LOAD:
     876             :     case ISD::STORE:
     877             : 
     878             :     // These operations are done with 32-bit instructions anyway.
     879             :     case ISD::AND:
     880             :     case ISD::OR:
     881             :     case ISD::XOR:
     882             :     case ISD::SELECT:
     883             :       // TODO: Extensions?
     884             :       return true;
     885        7643 :     default:
     886        7643 :       return false;
     887             :     }
     888             :   }
     889             : 
     890             :   // SimplifySetCC uses this function to determine whether or not it should
     891             :   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
     892      222356 :   if (VT == MVT::i1 && Op == ISD::SETCC)
     893             :     return false;
     894             : 
     895      221912 :   return TargetLowering::isTypeDesirableForOp(Op, VT);
     896             : }
     897             : 
     898       32206 : SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
     899             :                                                    const SDLoc &SL,
     900             :                                                    SDValue Chain,
     901             :                                                    uint64_t Offset) const {
     902       64412 :   const DataLayout &DL = DAG.getDataLayout();
     903       32206 :   MachineFunction &MF = DAG.getMachineFunction();
     904       32206 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
     905             : 
     906             :   const ArgDescriptor *InputPtrReg;
     907             :   const TargetRegisterClass *RC;
     908             : 
     909             :   std::tie(InputPtrReg, RC)
     910       96618 :     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
     911             : 
     912       32206 :   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     913       64412 :   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
     914             :   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
     915       32206 :     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
     916             : 
     917             :   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
     918       96618 :                      DAG.getConstant(Offset, SL, PtrVT));
     919             : }
     920             : 
     921          28 : SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
     922             :                                             const SDLoc &SL) const {
     923          28 :   auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
     924          28 :   uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
     925          28 :   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
     926             : }
     927             : 
     928       32178 : SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
     929             :                                          const SDLoc &SL, SDValue Val,
     930             :                                          bool Signed,
     931             :                                          const ISD::InputArg *Arg) const {
     932       96407 :   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
     933          83 :       VT.bitsLT(MemVT)) {
     934          92 :     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
     935          46 :     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
     936             :   }
     937             : 
     938       32178 :   if (MemVT.isFloatingPoint())
     939        2426 :     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
     940       29752 :   else if (Signed)
     941          16 :     Val = DAG.getSExtOrTrunc(Val, SL, VT);
     942             :   else
     943       29736 :     Val = DAG.getZExtOrTrunc(Val, SL, VT);
     944             : 
     945       32178 :   return Val;
     946             : }
     947             : 
     948       32178 : SDValue SITargetLowering::lowerKernargMemParameter(
     949             :   SelectionDAG &DAG, EVT VT, EVT MemVT,
     950             :   const SDLoc &SL, SDValue Chain,
     951             :   uint64_t Offset, bool Signed,
     952             :   const ISD::InputArg *Arg) const {
     953       64356 :   const DataLayout &DL = DAG.getDataLayout();
     954       32178 :   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     955       32178 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
     956       64356 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
     957             : 
     958       32178 :   unsigned Align = DL.getABITypeAlignment(Ty);
     959             : 
     960       32178 :   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
     961             :   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
     962       32178 :                              MachineMemOperand::MONonTemporal |
     963       32178 :                              MachineMemOperand::MODereferenceable |
     964       64356 :                              MachineMemOperand::MOInvariant);
     965             : 
     966       32178 :   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
     967       96534 :   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
     968             : }
     969             : 
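                      : // Lower an argument that the calling convention assigned to a stack
                      : // location: byval arguments become fixed frame indices; everything else is
                      : // loaded from a fixed stack object with the extension implied by the
                      : // location info.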
     970         184 : SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
     971             :                                               const SDLoc &SL, SDValue Chain,
     972             :                                               const ISD::InputArg &Arg) const {
     973         184 :   MachineFunction &MF = DAG.getMachineFunction();
     974         184 :   MachineFrameInfo &MFI = MF.getFrameInfo();
     975             : 
     976         368 :   if (Arg.Flags.isByVal()) {
     977          33 :     unsigned Size = Arg.Flags.getByValSize();
     978          33 :     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
     979          33 :     return DAG.getFrameIndex(FrameIdx, MVT::i32);
     980             :   }
     981             : 
     982         151 :   unsigned ArgOffset = VA.getLocMemOffset();
     983         302 :   unsigned ArgSize = VA.getValVT().getStoreSize();
     984             : 
     985         151 :   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
     986             : 
     987             :   // Create load nodes to retrieve arguments from the stack.
     988         151 :   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
     989         151 :   SDValue ArgValue;
     990             : 
      991             :   // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
     992         151 :   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
     993         151 :   MVT MemVT = VA.getValVT();
     994             : 
     995         151 :   switch (VA.getLocInfo()) {
     996             :   default:
     997             :     break;
     998           0 :   case CCValAssign::BCvt:
     999           0 :     MemVT = VA.getLocVT();
    1000           0 :     break;
    1001           0 :   case CCValAssign::SExt:
    1002           0 :     ExtType = ISD::SEXTLOAD;
    1003           0 :     break;
    1004           0 :   case CCValAssign::ZExt:
    1005           0 :     ExtType = ISD::ZEXTLOAD;
    1006           0 :     break;
    1007           3 :   case CCValAssign::AExt:
    1008           3 :     ExtType = ISD::EXTLOAD;
    1009           3 :     break;
    1010             :   }
    1011             : 
    1012         151 :   ArgValue = DAG.getExtLoad(
    1013             :     ExtType, SL, VA.getLocVT(), Chain, FIN,
    1014             :     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
    1015         604 :     MemVT);
    1016         151 :   return ArgValue;
    1017             : }
    1018             : 
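                      : // Create a live-in copy of the register preloaded with the requested value,
                      : // using the register class reported by the function argument info.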
    1019         169 : SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
    1020             :   const SIMachineFunctionInfo &MFI,
    1021             :   EVT VT,
    1022             :   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
    1023             :   const ArgDescriptor *Reg;
    1024             :   const TargetRegisterClass *RC;
    1025             : 
    1026         507 :   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
    1027         169 :   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
    1028             : }
    1029             : 
    1030         547 : static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
    1031             :                                    CallingConv::ID CallConv,
    1032             :                                    ArrayRef<ISD::InputArg> Ins,
    1033             :                                    BitVector &Skipped,
    1034             :                                    FunctionType *FType,
    1035             :                                    SIMachineFunctionInfo *Info) {
    1036        2980 :   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    1037        4866 :     const ISD::InputArg &Arg = Ins[I];
    1038             : 
     1039             :     // First, check if it's a PS input addr.
    1040        5665 :     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
    1041        6985 :         !Arg.Flags.isByVal() && PSInputNum <= 15) {
    1042             : 
    1043        3193 :       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
    1044             :         // We can safely skip PS inputs.
    1045         852 :         Skipped.set(I);
    1046         852 :         ++PSInputNum;
    1047         852 :         continue;
    1048             :       }
    1049             : 
    1050        1188 :       Info->markPSInputAllocated(PSInputNum);
    1051         594 :       if (Arg.Used)
    1052         551 :         Info->markPSInputEnabled(PSInputNum);
    1053             : 
    1054         594 :       ++PSInputNum;
    1055             :     }
    1056             : 
     1057             :     // Second, split vertices into their elements.
    1058        3162 :     if (Arg.VT.isVector()) {
    1059         447 :       ISD::InputArg NewArg = Arg;
    1060         447 :       NewArg.Flags.setSplit();
    1061         447 :       NewArg.VT = Arg.VT.getVectorElementType();
    1062             : 
    1063             :       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
    1064             :       // three or five element vertex only needs three or five registers,
    1065             :       // NOT four or eight.
    1066         894 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1067         447 :       unsigned NumElements = ParamType->getVectorNumElements();
    1068             : 
    1069        2317 :       for (unsigned J = 0; J != NumElements; ++J) {
    1070        1870 :         Splits.push_back(NewArg);
    1071        1870 :         NewArg.PartOffset += NewArg.VT.getStoreSize();
    1072             :       }
    1073             :     } else {
    1074        1134 :       Splits.push_back(Arg);
    1075             :     }
    1076             :   }
    1077         547 : }
    1078             : 
    1079             : // Allocate special inputs passed in VGPRs.
    1080       14158 : static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
    1081             :                                            MachineFunction &MF,
    1082             :                                            const SIRegisterInfo &TRI,
    1083             :                                            SIMachineFunctionInfo &Info) {
    1084       14158 :   if (Info.hasWorkItemIDX()) {
    1085       13611 :     unsigned Reg = AMDGPU::VGPR0;
    1086       13611 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1087             : 
    1088       13611 :     CCInfo.AllocateReg(Reg);
    1089       13611 :     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
    1090             :   }
    1091             : 
    1092       14158 :   if (Info.hasWorkItemIDY()) {
    1093         105 :     unsigned Reg = AMDGPU::VGPR1;
    1094         105 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1095             : 
    1096         105 :     CCInfo.AllocateReg(Reg);
    1097         105 :     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    1098             :   }
    1099             : 
    1100       14158 :   if (Info.hasWorkItemIDZ()) {
    1101          58 :     unsigned Reg = AMDGPU::VGPR2;
    1102          58 :     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1103             : 
    1104          58 :     CCInfo.AllocateReg(Reg);
    1105          58 :     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    1106             :   }
    1107       14158 : }
    1108             : 
     1109             : // Try to allocate a VGPR at the end of the argument list, or, if no argument
     1110             : // VGPRs are left, allocate a stack slot.
    1111          29 : static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
    1112             :   ArrayRef<MCPhysReg> ArgVGPRs
    1113          87 :     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
    1114          29 :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
    1115          29 :   if (RegIdx == ArgVGPRs.size()) {
    1116             :     // Spill to stack required.
    1117           8 :     int64_t Offset = CCInfo.AllocateStack(4, 4);
    1118             : 
    1119           8 :     return ArgDescriptor::createStack(Offset);
    1120             :   }
    1121             : 
    1122          42 :   unsigned Reg = ArgVGPRs[RegIdx];
    1123          21 :   Reg = CCInfo.AllocateReg(Reg);
    1124             :   assert(Reg != AMDGPU::NoRegister);
    1125             : 
    1126          21 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1127          21 :   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    1128             :   return ArgDescriptor::createRegister(Reg);
    1129             : }
    1130             : 
    1131         115 : static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
    1132             :                                              const TargetRegisterClass *RC,
    1133             :                                              unsigned NumArgRegs) {
    1134         345 :   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
    1135         115 :   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
    1136         115 :   if (RegIdx == ArgSGPRs.size())
    1137           0 :     report_fatal_error("ran out of SGPRs for arguments");
    1138             : 
    1139         230 :   unsigned Reg = ArgSGPRs[RegIdx];
    1140         115 :   Reg = CCInfo.AllocateReg(Reg);
    1141             :   assert(Reg != AMDGPU::NoRegister);
    1142             : 
    1143         115 :   MachineFunction &MF = CCInfo.getMachineFunction();
    1144         115 :   MF.addLiveIn(Reg, RC);
    1145         115 :   return ArgDescriptor::createRegister(Reg);
    1146             : }
    1147             : 
    1148             : static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
    1149          62 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
    1150             : }
    1151             : 
    1152             : static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
    1153          53 :   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
    1154             : }
    1155             : 
    1156         659 : static void allocateSpecialInputVGPRs(CCState &CCInfo,
    1157             :                                       MachineFunction &MF,
    1158             :                                       const SIRegisterInfo &TRI,
    1159             :                                       SIMachineFunctionInfo &Info) {
    1160         659 :   if (Info.hasWorkItemIDX())
    1161          13 :     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
    1162             : 
    1163         659 :   if (Info.hasWorkItemIDY())
    1164           8 :     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
    1165             : 
    1166         659 :   if (Info.hasWorkItemIDZ())
    1167           8 :     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
    1168         659 : }
    1169             : 
    1170         659 : static void allocateSpecialInputSGPRs(CCState &CCInfo,
    1171             :                                       MachineFunction &MF,
    1172             :                                       const SIRegisterInfo &TRI,
    1173             :                                       SIMachineFunctionInfo &Info) {
    1174         659 :   auto &ArgInfo = Info.getArgInfo();
    1175             : 
    1176             :   // TODO: Unify handling with private memory pointers.
    1177             : 
    1178         659 :   if (Info.hasDispatchPtr())
    1179          10 :     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
    1180             : 
    1181         659 :   if (Info.hasQueuePtr())
    1182          11 :     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
    1183             : 
    1184         659 :   if (Info.hasKernargSegmentPtr())
    1185          13 :     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
    1186             : 
    1187         659 :   if (Info.hasDispatchID())
    1188          10 :     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
    1189             : 
    1190             :   // flat_scratch_init is not applicable for non-kernel functions.
    1191             : 
    1192         659 :   if (Info.hasWorkGroupIDX())
    1193          22 :     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
    1194             : 
    1195         659 :   if (Info.hasWorkGroupIDY())
    1196          20 :     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
    1197             : 
    1198         659 :   if (Info.hasWorkGroupIDZ())
    1199          20 :     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
    1200             : 
    1201         659 :   if (Info.hasImplicitArgPtr())
    1202           9 :     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
    1203         659 : }
    1204             : 
    1205             : // Allocate special inputs passed in user SGPRs.
    1206       14158 : static void allocateHSAUserSGPRs(CCState &CCInfo,
    1207             :                                  MachineFunction &MF,
    1208             :                                  const SIRegisterInfo &TRI,
    1209             :                                  SIMachineFunctionInfo &Info) {
    1210       14158 :   if (Info.hasImplicitBufferPtr()) {
    1211           2 :     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    1212           2 :     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    1213           2 :     CCInfo.AllocateReg(ImplicitBufferPtrReg);
    1214             :   }
    1215             : 
    1216             :   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
    1217       14158 :   if (Info.hasPrivateSegmentBuffer()) {
    1218        1742 :     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    1219        1742 :     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    1220        1742 :     CCInfo.AllocateReg(PrivateSegmentBufferReg);
    1221             :   }
    1222             : 
    1223       14158 :   if (Info.hasDispatchPtr()) {
    1224          25 :     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    1225          25 :     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    1226          25 :     CCInfo.AllocateReg(DispatchPtrReg);
    1227             :   }
    1228             : 
    1229       14158 :   if (Info.hasQueuePtr()) {
    1230          57 :     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    1231          57 :     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    1232          57 :     CCInfo.AllocateReg(QueuePtrReg);
    1233             :   }
    1234             : 
    1235       14158 :   if (Info.hasKernargSegmentPtr()) {
    1236       12772 :     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
    1237       12772 :     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    1238       12772 :     CCInfo.AllocateReg(InputPtrReg);
    1239             :   }
    1240             : 
    1241       14158 :   if (Info.hasDispatchID()) {
    1242           5 :     unsigned DispatchIDReg = Info.addDispatchID(TRI);
    1243           5 :     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    1244           5 :     CCInfo.AllocateReg(DispatchIDReg);
    1245             :   }
    1246             : 
    1247       14158 :   if (Info.hasFlatScratchInit()) {
    1248         332 :     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    1249         332 :     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    1250         332 :     CCInfo.AllocateReg(FlatScratchInitReg);
    1251             :   }
    1252             : 
    1253             :   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
    1254             :   // these from the dispatch pointer.
    1255       14158 : }
    1256             : 
    1257             : // Allocate special input registers that are initialized per-wave.
    1258       14158 : static void allocateSystemSGPRs(CCState &CCInfo,
    1259             :                                 MachineFunction &MF,
    1260             :                                 SIMachineFunctionInfo &Info,
    1261             :                                 CallingConv::ID CallConv,
    1262             :                                 bool IsShader) {
    1263       14158 :   if (Info.hasWorkGroupIDX()) {
    1264       13611 :     unsigned Reg = Info.addWorkGroupIDX();
    1265       13611 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1266       13611 :     CCInfo.AllocateReg(Reg);
    1267             :   }
    1268             : 
    1269       14158 :   if (Info.hasWorkGroupIDY()) {
    1270          24 :     unsigned Reg = Info.addWorkGroupIDY();
    1271          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1272          24 :     CCInfo.AllocateReg(Reg);
    1273             :   }
    1274             : 
    1275       14158 :   if (Info.hasWorkGroupIDZ()) {
    1276          24 :     unsigned Reg = Info.addWorkGroupIDZ();
    1277          24 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1278          24 :     CCInfo.AllocateReg(Reg);
    1279             :   }
    1280             : 
    1281       14158 :   if (Info.hasWorkGroupInfo()) {
    1282           0 :     unsigned Reg = Info.addWorkGroupInfo();
    1283           0 :     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    1284           0 :     CCInfo.AllocateReg(Reg);
    1285             :   }
    1286             : 
    1287       14158 :   if (Info.hasPrivateSegmentWaveByteOffset()) {
    1288             :     // Scratch wave offset passed in system SGPR.
    1289             :     unsigned PrivateSegmentWaveByteOffsetReg;
    1290             : 
    1291       13659 :     if (IsShader) {
    1292          48 :       PrivateSegmentWaveByteOffsetReg =
    1293             :         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
    1294             : 
    1295             :       // This is true if the scratch wave byte offset doesn't have a fixed
    1296             :       // location.
    1297          48 :       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
    1298          44 :         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
    1299             :         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    1300             :       }
    1301             :     } else
    1302       13611 :       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    1303             : 
    1304       13659 :     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    1305       13659 :     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
    1306             :   }
    1307       14158 : }
    1308             : 
    1309       14174 : static void reservePrivateMemoryRegs(const TargetMachine &TM,
    1310             :                                      MachineFunction &MF,
    1311             :                                      const SIRegisterInfo &TRI,
    1312             :                                      SIMachineFunctionInfo &Info) {
    1313             :   // Now that we've figured out where the scratch register inputs are, see if
     1314             :   // we should reserve the arguments and use them directly.
    1315       14174 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    1316       14174 :   bool HasStackObjects = MFI.hasStackObjects();
    1317             : 
    1318             :   // Record that we know we have non-spill stack objects so we don't need to
    1319             :   // check all stack objects later.
    1320       14174 :   if (HasStackObjects)
    1321             :     Info.setHasNonSpillStackObjects(true);
    1322             : 
    1323             :   // Everything live out of a block is spilled with fast regalloc, so it's
    1324             :   // almost certain that spilling will be required.
    1325       14174 :   if (TM.getOptLevel() == CodeGenOpt::None)
    1326             :     HasStackObjects = true;
    1327             : 
    1328             :   // For now assume stack access is needed in any callee functions, so we need
    1329             :   // the scratch registers to pass in.
    1330       13997 :   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
    1331             : 
    1332       14174 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1333       14174 :   if (ST.isAmdCodeObjectV2(MF)) {
    1334        1745 :     if (RequiresStackAccess) {
    1335             :       // If we have stack objects, we unquestionably need the private buffer
    1336             :       // resource. For the Code Object V2 ABI, this will be the first 4 user
    1337             :       // SGPR inputs. We can reserve those and use them directly.
    1338             : 
    1339             :       unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
    1340         427 :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    1341         854 :       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    1342             : 
    1343         427 :       if (MFI.hasCalls()) {
    1344             :         // If we have calls, we need to keep the frame register in a register
    1345             :         // that won't be clobbered by a call, so ensure it is copied somewhere.
    1346             : 
    1347             :         // This is not a problem for the scratch wave offset, because the same
    1348             :         // registers are reserved in all functions.
    1349             : 
    1350             :         // FIXME: Nothing is really ensuring this is a call preserved register,
    1351             :         // it's just selected from the end so it happens to be.
    1352             :         unsigned ReservedOffsetReg
    1353         230 :           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1354             :         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1355             :       } else {
    1356             :         unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
    1357         197 :           AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1358             :         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    1359             :       }
    1360             :     } else {
    1361             :       unsigned ReservedBufferReg
    1362        1318 :         = TRI.reservedPrivateSegmentBufferReg(MF);
    1363             :       unsigned ReservedOffsetReg
    1364        1318 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1365             : 
    1366             :       // We tentatively reserve the last registers (skipping the last two
    1367             :       // which may contain VCC). After register allocation, we'll replace
    1368             :       // these with the ones immediately after those which were really
     1369             :       // allocated. In the prologue, copies will be inserted from the argument
    1370             :       // to these reserved registers.
    1371        2636 :       Info.setScratchRSrcReg(ReservedBufferReg);
    1372             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1373             :     }
    1374             :   } else {
    1375       12429 :     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
    1376             : 
    1377             :     // Without HSA, relocations are used for the scratch pointer and the
    1378             :     // buffer resource setup is always inserted in the prologue. Scratch wave
    1379             :     // offset is still in an input SGPR.
    1380       24858 :     Info.setScratchRSrcReg(ReservedBufferReg);
    1381             : 
    1382       12429 :     if (HasStackObjects && !MFI.hasCalls()) {
    1383             :       unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
    1384         277 :         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
    1385             :       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    1386             :     } else {
    1387             :       unsigned ReservedOffsetReg
    1388       12152 :         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
    1389             :       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    1390             :     }
    1391             :   }
    1392       14174 : }
    1393             : 
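                      : // Split CSR handling only applies to non-entry functions; entry functions
                      : // (kernels and shaders) have no caller whose registers would need to be
                      : // preserved.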
    1394       14643 : bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
    1395       14643 :   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1396       14643 :   return !Info->isEntryFunction();
    1397             : }
    1398             : 
    1399         659 : void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
    1400             : 
    1401         659 : }
    1402             : 
    1403         659 : void SITargetLowering::insertCopiesSplitCSR(
    1404             :   MachineBasicBlock *Entry,
    1405             :   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
    1406        1318 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1407             : 
    1408         659 :   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    1409         659 :   if (!IStart)
    1410           0 :     return;
    1411             : 
    1412         659 :   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    1413         659 :   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
    1414         659 :   MachineBasicBlock::iterator MBBI = Entry->begin();
    1415        1318 :   for (const MCPhysReg *I = IStart; *I; ++I) {
    1416         659 :     const TargetRegisterClass *RC = nullptr;
    1417        1318 :     if (AMDGPU::SReg_64RegClass.contains(*I))
    1418             :       RC = &AMDGPU::SGPR_64RegClass;
    1419        1318 :     else if (AMDGPU::SReg_32RegClass.contains(*I))
    1420             :       RC = &AMDGPU::SGPR_32RegClass;
    1421             :     else
    1422           0 :       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    1423             : 
    1424         659 :     unsigned NewVR = MRI->createVirtualRegister(RC);
    1425             :     // Create copy from CSR to a virtual register.
    1426        1318 :     Entry->addLiveIn(*I);
    1427        3295 :     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
    1428         659 :       .addReg(*I);
    1429             : 
    1430             :     // Insert the copy-back instructions right before the terminator.
    1431        2632 :     for (auto *Exit : Exits)
    1432        3275 :       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
    1433        1965 :               TII->get(TargetOpcode::COPY), *I)
    1434         655 :         .addReg(NewVR);
    1435             :   }
    1436             : }
    1437             : 
    1438       14820 : SDValue SITargetLowering::LowerFormalArguments(
    1439             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1440             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1441             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1442       29640 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1443             : 
    1444       14820 :   MachineFunction &MF = DAG.getMachineFunction();
    1445       29640 :   FunctionType *FType = MF.getFunction()->getFunctionType();
    1446       14820 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1447       14820 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    1448             : 
    1449       29640 :   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    1450           3 :     const Function *Fn = MF.getFunction();
    1451             :     DiagnosticInfoUnsupported NoGraphicsHSA(
    1452           9 :         *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    1453           3 :     DAG.getContext()->diagnose(NoGraphicsHSA);
    1454           3 :     return DAG.getEntryNode();
    1455             :   }
    1456             : 
     1457             :   // Create stack objects that are used for emitting the debugger prologue if
     1458             :   // the "amdgpu-debugger-emit-prologue" attribute was specified.
    1459       14817 :   if (ST.debuggerEmitPrologue())
    1460           4 :     createDebuggerPrologueStackObjects(MF);
    1461             : 
    1462       14817 :   SmallVector<ISD::InputArg, 16> Splits;
    1463       29634 :   SmallVector<CCValAssign, 16> ArgLocs;
    1464       44451 :   BitVector Skipped(Ins.size());
    1465             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1466       29634 :                  *DAG.getContext());
    1467             : 
    1468       14817 :   bool IsShader = AMDGPU::isShader(CallConv);
    1469       14817 :   bool IsKernel = AMDGPU::isKernel(CallConv);
    1470       14817 :   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
    1471             : 
    1472       14817 :   if (!IsEntryFunc) {
    1473             :     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
    1474             :     // this when allocating argument fixed offsets.
    1475         659 :     CCInfo.AllocateStack(4, 4);
    1476             :   }
    1477             : 
    1478       14817 :   if (IsShader) {
    1479        1094 :     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
    1480             : 
    1481             :     // At least one interpolation mode must be enabled or else the GPU will
    1482             :     // hang.
    1483             :     //
    1484             :     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    1485             :     // set PSInputAddr, the user wants to enable some bits after the compilation
    1486             :     // based on run-time states. Since we can't know what the final PSInputEna
     1487             :     // will look like, we shouldn't do anything here and the user should take
    1488             :     // responsibility for the correct programming.
    1489             :     //
    1490             :     // Otherwise, the following restrictions apply:
    1491             :     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    1492             :     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    1493             :     //   enabled too.
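                      :     // If neither restriction is satisfied, force the first input (bit 0 of
                      :     // PSInputAddr/PSInputEna) on below so that the enable mask is never zero.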
    1494         986 :     if (CallConv == CallingConv::AMDGPU_PS &&
    1495         756 :         ((Info->getPSInputAddr() & 0x7F) == 0 ||
    1496         320 :          ((Info->getPSInputAddr() & 0xF) == 0 &&
    1497           6 :           Info->isPSInputAllocated(11)))) {
    1498         124 :       CCInfo.AllocateReg(AMDGPU::VGPR0);
    1499         124 :       CCInfo.AllocateReg(AMDGPU::VGPR1);
    1500         248 :       Info->markPSInputAllocated(0);
    1501         124 :       Info->markPSInputEnabled(0);
    1502             :     }
    1503             : 
    1504             :     assert(!Info->hasDispatchPtr() &&
    1505             :            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
    1506             :            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
    1507             :            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
    1508             :            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
    1509             :            !Info->hasWorkItemIDZ());
    1510       14270 :   } else if (IsKernel) {
    1511             :     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    1512             :   } else {
    1513        1977 :     Splits.append(Ins.begin(), Ins.end());
    1514             :   }
    1515             : 
    1516       14817 :   if (IsEntryFunc) {
    1517       14158 :     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    1518       14158 :     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
    1519             :   }
    1520             : 
    1521       14817 :   if (IsKernel) {
    1522       13611 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1523             :   } else {
    1524        1206 :     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
    1525        1206 :     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
    1526             :   }
    1527             : 
    1528       29634 :   SmallVector<SDValue, 16> Chains;
    1529             : 
    1530       66039 :   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    1531       72810 :     const ISD::InputArg &Arg = Ins[i];
    1532       73662 :     if (Skipped[i]) {
    1533        1704 :       InVals.push_back(DAG.getUNDEF(Arg.VT));
    1534       34416 :       continue;
    1535             :     }
    1536             : 
    1537       71106 :     CCValAssign &VA = ArgLocs[ArgIdx++];
    1538       35553 :     MVT VT = VA.getLocVT();
    1539             : 
    1540       69215 :     if (IsEntryFunc && VA.isMemLoc()) {
    1541       64162 :       VT = Ins[i].VT;
    1542       64162 :       EVT MemVT = VA.getLocVT();
    1543             : 
    1544       64162 :       const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
    1545       32081 :         VA.getLocMemOffset();
    1546       96243 :       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
    1547             : 
     1548             :       // The first 36 bytes of the input buffer contain information about
    1549             :       // thread group and global sizes.
    1550             :       SDValue Arg = lowerKernargMemParameter(
    1551      160405 :         DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
    1552       64162 :       Chains.push_back(Arg.getValue(1));
    1553             : 
    1554             :       auto *ParamTy =
    1555      128324 :         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
    1556       45142 :       if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    1557       40524 :           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
     1558             :         // On SI, local pointers are just offsets into LDS, so they are always
     1559             :         // less than 16 bits. On CI and newer they could potentially be
    1560             :         // real pointers, so we can't guarantee their size.
    1561         572 :         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
    1562        1716 :                           DAG.getValueType(MVT::i16));
    1563             :       }
    1564             : 
    1565       32081 :       InVals.push_back(Arg);
    1566       32081 :       continue;
    1567        5363 :     } else if (!IsEntryFunc && VA.isMemLoc()) {
    1568         184 :       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
    1569         184 :       InVals.push_back(Val);
    1570         368 :       if (!Arg.Flags.isByVal())
    1571         302 :         Chains.push_back(Val.getValue(1));
    1572         184 :       continue;
    1573             :     }
    1574             : 
    1575             :     assert(VA.isRegLoc() && "Parameter must be in a register!");
    1576             : 
    1577        3288 :     unsigned Reg = VA.getLocReg();
    1578        3288 :     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
    1579        6576 :     EVT ValVT = VA.getValVT();
    1580             : 
    1581        3288 :     Reg = MF.addLiveIn(Reg, RC);
    1582        3288 :     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1583             : 
    1584             :     // If this is an 8 or 16-bit value, it is really passed promoted
    1585             :     // to 32 bits. Insert an assert[sz]ext to capture this, then
    1586             :     // truncate to the right size.
    1587        3288 :     switch (VA.getLocInfo()) {
    1588             :     case CCValAssign::Full:
    1589             :       break;
    1590           0 :     case CCValAssign::BCvt:
    1591           0 :       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
    1592           0 :       break;
    1593           7 :     case CCValAssign::SExt:
    1594           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
    1595          21 :                         DAG.getValueType(ValVT));
    1596           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1597           7 :       break;
    1598           7 :     case CCValAssign::ZExt:
    1599           7 :       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
    1600          21 :                         DAG.getValueType(ValVT));
    1601           7 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1602           7 :       break;
    1603           6 :     case CCValAssign::AExt:
    1604           6 :       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
    1605           6 :       break;
    1606           0 :     default:
    1607           0 :       llvm_unreachable("Unknown loc info!");
    1608             :     }
    1609             : 
    1610        4869 :     if (IsShader && Arg.VT.isVector()) {
    1611             :       // Build a vector from the registers
    1612         894 :       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
    1613         447 :       unsigned NumElements = ParamType->getVectorNumElements();
    1614             : 
    1615         894 :       SmallVector<SDValue, 4> Regs;
    1616         447 :       Regs.push_back(Val);
    1617        1870 :       for (unsigned j = 1; j != NumElements; ++j) {
    1618        2846 :         Reg = ArgLocs[ArgIdx++].getLocReg();
    1619        1423 :         Reg = MF.addLiveIn(Reg, RC);
    1620             : 
    1621        1423 :         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1622        1423 :         Regs.push_back(Copy);
    1623             :       }
    1624             : 
    1625             :       // Fill up the missing vector elements
    1626         447 :       NumElements = Arg.VT.getVectorNumElements() - NumElements;
    1627         447 :       Regs.append(NumElements, DAG.getUNDEF(VT));
    1628             : 
    1629        1341 :       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
    1630             :       continue;
    1631             :     }
    1632             : 
    1633        2841 :     InVals.push_back(Val);
    1634             :   }
    1635             : 
    1636       14817 :   if (!IsEntryFunc) {
    1637             :     // Special inputs come after user arguments.
    1638         659 :     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
    1639             :   }
    1640             : 
    1641             :   // Start adding system SGPRs.
    1642       14817 :   if (IsEntryFunc) {
    1643       14158 :     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
    1644             :   } else {
    1645         659 :     CCInfo.AllocateReg(Info->getScratchRSrcReg());
    1646         659 :     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    1647         659 :     CCInfo.AllocateReg(Info->getFrameOffsetReg());
    1648         659 :     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
    1649             :   }
    1650             : 
    1651             :   auto &ArgUsageInfo =
    1652       14817 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    1653       14817 :   ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
    1654             : 
    1655       14817 :   unsigned StackArgSize = CCInfo.getNextStackOffset();
    1656       29634 :   Info->setBytesInStackArgArea(StackArgSize);
    1657             : 
    1658       14817 :   return Chains.empty() ? Chain :
    1659       53214 :     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    1660             : }
    1661             : 
    1662             : // TODO: If return values can't fit in registers, we should return as many as
    1663             : // possible in registers before passing on stack.
    1664       15275 : bool SITargetLowering::CanLowerReturn(
    1665             :   CallingConv::ID CallConv,
    1666             :   MachineFunction &MF, bool IsVarArg,
    1667             :   const SmallVectorImpl<ISD::OutputArg> &Outs,
    1668             :   LLVMContext &Context) const {
    1669             :   // Replacing returns with sret/stack usage doesn't make sense for shaders.
    1670             :   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
    1671             :   // for shaders. Vector types should be explicitly handled by CC.
    1672       15275 :   if (AMDGPU::isEntryFunctionCC(CallConv))
    1673             :     return true;
    1674             : 
    1675        1114 :   SmallVector<CCValAssign, 16> RVLocs;
    1676        2228 :   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    1677        1114 :   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
    1678             : }
    1679             : 
    1680             : SDValue
    1681       14773 : SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    1682             :                               bool isVarArg,
    1683             :                               const SmallVectorImpl<ISD::OutputArg> &Outs,
    1684             :                               const SmallVectorImpl<SDValue> &OutVals,
    1685             :                               const SDLoc &DL, SelectionDAG &DAG) const {
    1686       14773 :   MachineFunction &MF = DAG.getMachineFunction();
    1687       14773 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    1688             : 
    1689       14773 :   if (AMDGPU::isKernel(CallConv)) {
    1690             :     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
    1691       13601 :                                              OutVals, DL, DAG);
    1692             :   }
    1693             : 
    1694        1172 :   bool IsShader = AMDGPU::isShader(CallConv);
    1695             : 
    1696        3516 :   Info->setIfReturnsVoid(Outs.size() == 0);
    1697        1172 :   bool IsWaveEnd = Info->returnsVoid() && IsShader;
    1698             : 
    1699        1172 :   SmallVector<ISD::OutputArg, 48> Splits;
    1700        2344 :   SmallVector<SDValue, 48> SplitVals;
    1701             : 
    1702             :   // Split vectors into their elements.
    1703        3249 :   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    1704        1810 :     const ISD::OutputArg &Out = Outs[i];
    1705             : 
    1706        1406 :     if (IsShader && Out.VT.isVector()) {
    1707         114 :       MVT VT = Out.VT.getVectorElementType();
    1708         114 :       ISD::OutputArg NewOut = Out;
    1709         114 :       NewOut.Flags.setSplit();
    1710         114 :       NewOut.VT = VT;
    1711             : 
    1712             :       // We want the original number of vector elements here, e.g.
    1713             :       // three or five, not four or eight.
    1714         114 :       unsigned NumElements = Out.ArgVT.getVectorNumElements();
    1715             : 
    1716         546 :       for (unsigned j = 0; j != NumElements; ++j) {
    1717         864 :         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
    1718        1296 :                                    DAG.getConstant(j, DL, MVT::i32));
    1719         432 :         SplitVals.push_back(Elem);
    1720         432 :         Splits.push_back(NewOut);
    1721         432 :         NewOut.PartOffset += NewOut.VT.getStoreSize();
    1722             :       }
    1723             :     } else {
    1724        1582 :       SplitVals.push_back(OutVals[i]);
    1725         791 :       Splits.push_back(Out);
    1726             :     }
    1727             :   }
    1728             : 
     1729             :   // CCValAssign - represents the assignment of the return value to a location.
    1730        2344 :   SmallVector<CCValAssign, 48> RVLocs;
    1731             : 
    1732             :   // CCState - Info about the registers and stack slots.
    1733             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
    1734        2344 :                  *DAG.getContext());
    1735             : 
    1736             :   // Analyze outgoing return values.
    1737        1172 :   CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
    1738             : 
    1739        1172 :   SDValue Flag;
    1740        2344 :   SmallVector<SDValue, 48> RetOps;
    1741        1172 :   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
    1742             : 
    1743             :   // Add return address for callable functions.
    1744        1172 :   if (!Info->isEntryFunction()) {
    1745        1246 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1746             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    1747        1246 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    1748             : 
    1749             :     // FIXME: Should be able to use a vreg here, but need a way to prevent it
     1750             :     // from being allocated to a CSR.
    1751             : 
    1752             :     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    1753         623 :                                                 MVT::i64);
    1754             : 
    1755         623 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
    1756        1246 :     Flag = Chain.getValue(1);
    1757             : 
    1758         623 :     RetOps.push_back(PhysReturnAddrReg);
    1759             :   }
    1760             : 
    1761             :   // Copy the result values into the output registers.
    1762        1223 :   for (unsigned i = 0, realRVLocIdx = 0;
    1763        4790 :        i != RVLocs.size();
    1764             :        ++i, ++realRVLocIdx) {
    1765        2446 :     CCValAssign &VA = RVLocs[i];
    1766             :     assert(VA.isRegLoc() && "Can only return in registers!");
    1767             :     // TODO: Partially return in registers if return values don't fit.
    1768             : 
    1769        2446 :     SDValue Arg = SplitVals[realRVLocIdx];
    1770             : 
    1771             :     // Copied from other backends.
    1772        1223 :     switch (VA.getLocInfo()) {
    1773             :     case CCValAssign::Full:
    1774             :       break;
    1775           0 :     case CCValAssign::BCvt:
    1776           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    1777           0 :       break;
    1778           0 :     case CCValAssign::SExt:
    1779           0 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    1780           0 :       break;
    1781           0 :     case CCValAssign::ZExt:
    1782           0 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    1783           0 :       break;
    1784           3 :     case CCValAssign::AExt:
    1785           6 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    1786           3 :       break;
    1787           0 :     default:
    1788           0 :       llvm_unreachable("Unknown loc info!");
    1789             :     }
    1790             : 
    1791        1223 :     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    1792        2446 :     Flag = Chain.getValue(1);
    1793        2446 :     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    1794             :   }
    1795             : 
    1796             :   // FIXME: Does sret work properly?
    1797        1172 :   if (!Info->isEntryFunction()) {
    1798             :     const SIRegisterInfo *TRI
    1799        1246 :       = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
    1800             :     const MCPhysReg *I =
    1801         623 :       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
    1802         623 :     if (I) {
    1803        1869 :       for (; *I; ++I) {
    1804        1246 :         if (AMDGPU::SReg_64RegClass.contains(*I))
    1805           0 :           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
    1806        1246 :         else if (AMDGPU::SReg_32RegClass.contains(*I))
    1807         623 :           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
    1808             :         else
    1809           0 :           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    1810             :       }
    1811             :     }
    1812             :   }
    1813             : 
    1814             :   // Update chain and glue.
    1815        1172 :   RetOps[0] = Chain;
    1816        1172 :   if (Flag.getNode())
    1817         886 :     RetOps.push_back(Flag);
    1818             : 
    1819        1172 :   unsigned Opc = AMDGPUISD::ENDPGM;
    1820        1172 :   if (!IsWaveEnd)
    1821         886 :     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
    1822        3516 :   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
    1823             : }
    1824             : 
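                      : // Copy the values returned by a call out of the physical registers assigned
                      : // by the return calling convention, applying any promotion implied by the
                      : // location info.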
    1825         417 : SDValue SITargetLowering::LowerCallResult(
    1826             :     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
    1827             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1828             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    1829             :     SDValue ThisVal) const {
    1830         417 :   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
    1831             : 
    1832             :   // Assign locations to each value returned by this call.
    1833         834 :   SmallVector<CCValAssign, 16> RVLocs;
    1834             :   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
    1835         834 :                  *DAG.getContext());
    1836         417 :   CCInfo.AnalyzeCallResult(Ins, RetCC);
    1837             : 
    1838             :   // Copy all of the result registers out of their specified physreg.
    1839        1056 :   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    1840         222 :     CCValAssign VA = RVLocs[i];
    1841         111 :     SDValue Val;
    1842             : 
    1843         111 :     if (VA.isRegLoc()) {
    1844         222 :       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    1845         222 :       Chain = Val.getValue(1);
    1846         222 :       InFlag = Val.getValue(2);
    1847           0 :     } else if (VA.isMemLoc()) {
    1848           0 :       report_fatal_error("TODO: return values in memory");
    1849             :     } else
    1850             :       llvm_unreachable("unknown argument location type");
    1851             : 
    1852         111 :     switch (VA.getLocInfo()) {
    1853             :     case CCValAssign::Full:
    1854             :       break;
    1855           0 :     case CCValAssign::BCvt:
    1856           0 :       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
    1857           0 :       break;
    1858           7 :     case CCValAssign::ZExt:
    1859           7 :       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
    1860          28 :                         DAG.getValueType(VA.getValVT()));
    1861          14 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    1862           7 :       break;
    1863           7 :     case CCValAssign::SExt:
    1864           7 :       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
    1865          28 :                         DAG.getValueType(VA.getValVT()));
    1866          14 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    1867           7 :       break;
    1868           3 :     case CCValAssign::AExt:
    1869           6 :       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
    1870           3 :       break;
    1871           0 :     default:
    1872           0 :       llvm_unreachable("Unknown loc info!");
    1873             :     }
    1874             : 
    1875         111 :     InVals.push_back(Val);
    1876             :   }
    1877             : 
    1878         834 :   return Chain;
    1879             : }
    1880             : 
     1881             : // Add code to pass the special inputs that are required, depending on the
     1882             : // features used, separately from the explicit user arguments present in the IR.
    1883         449 : void SITargetLowering::passSpecialInputs(
    1884             :     CallLoweringInfo &CLI,
    1885             :     const SIMachineFunctionInfo &Info,
    1886             :     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    1887             :     SmallVectorImpl<SDValue> &MemOpChains,
    1888             :     SDValue Chain,
    1889             :     SDValue StackPtr) const {
    1890             :   // If we don't have a call site, this was a call inserted by
    1891             :   // legalization. These can never use special inputs.
    1892         898 :   if (!CLI.CS)
    1893           0 :     return;
    1894             : 
    1895         898 :   const Function *CalleeFunc = CLI.CS.getCalledFunction();
    1896             :   assert(CalleeFunc);
    1897             : 
    1898         449 :   SelectionDAG &DAG = CLI.DAG;
    1899         449 :   const SDLoc &DL = CLI.DL;
    1900             : 
    1901         449 :   const SISubtarget *ST = getSubtarget();
    1902         449 :   const SIRegisterInfo *TRI = ST->getRegisterInfo();
    1903             : 
    1904             :   auto &ArgUsageInfo =
    1905         449 :     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    1906             :   const AMDGPUFunctionArgInfo &CalleeArgInfo
    1907         449 :     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
    1908             : 
    1909         449 :   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
    1910             : 
    1911             :   // TODO: Unify with private memory register handling. This is complicated by
    1912             :   // the fact that at least in kernels, the input argument is not necessarily
    1913             :   // in the same location as the input.
    1914         449 :   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    1915             :     AMDGPUFunctionArgInfo::DISPATCH_PTR,
    1916             :     AMDGPUFunctionArgInfo::QUEUE_PTR,
    1917             :     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
    1918             :     AMDGPUFunctionArgInfo::DISPATCH_ID,
    1919             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    1920             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    1921             :     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    1922             :     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
    1923             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
    1924             :     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
    1925             :     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
    1926             :   };
    1927             : 
    1928        5388 :   for (auto InputID : InputRegs) {
    1929             :     const ArgDescriptor *OutgoingArg;
    1930             :     const TargetRegisterClass *ArgRC;
    1931             : 
    1932       14817 :     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
    1933        4939 :     if (!OutgoingArg)
    1934        4819 :       continue;
    1935             : 
    1936             :     const ArgDescriptor *IncomingArg;
    1937             :     const TargetRegisterClass *IncomingArgRC;
    1938             :     std::tie(IncomingArg, IncomingArgRC)
    1939         360 :       = CallerArgInfo.getPreloadedValue(InputID);
    1940             :     assert(IncomingArgRC == ArgRC);
    1941             : 
    1942             :     // All special arguments are ints for now.
    1943         240 :     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    1944         120 :     SDValue InputReg;
    1945             : 
    1946         120 :     if (IncomingArg) {
    1947         111 :       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
    1948             :     } else {
    1949             :       // The implicit arg ptr is special because it doesn't have a corresponding
    1950             :       // input for kernels, and is computed from the kernarg segment pointer.
    1951             :       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    1952           9 :       InputReg = getImplicitArgPtr(DAG, DL);
    1953             :     }
    1954             : 
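                     :     // Forward the value in the callee's designated register if it has one;
                     :     // otherwise store it at the callee's assigned stack offset.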
    1955         240 :     if (OutgoingArg->isRegister()) {
    1956         110 :       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    1957             :     } else {
    1958             :       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
    1959             :                                               InputReg,
    1960          10 :                                               OutgoingArg->getStackOffset());
    1961          10 :       MemOpChains.push_back(ArgStore);
    1962             :     }
    1963             :   }
    1964             : }
    1965             : 
    1966             : static bool canGuaranteeTCO(CallingConv::ID CC) {
    1967          39 :   return CC == CallingConv::Fast;
    1968             : }
    1969             : 
    1970             : /// Return true if we might ever do TCO for calls with this calling convention.
    1971             : static bool mayTailCallThisCC(CallingConv::ID CC) {
    1972          41 :   switch (CC) {
    1973             :   case CallingConv::C:
    1974             :     return true;
    1975          39 :   default:
    1976          39 :     return canGuaranteeTCO(CC);
    1977             :   }
    1978             : }
    1979             : 
    1980          41 : bool SITargetLowering::isEligibleForTailCallOptimization(
    1981             :     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    1982             :     const SmallVectorImpl<ISD::OutputArg> &Outs,
    1983             :     const SmallVectorImpl<SDValue> &OutVals,
    1984             :     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
    1985          39 :   if (!mayTailCallThisCC(CalleeCC))
    1986             :     return false;
    1987             : 
    1988          41 :   MachineFunction &MF = DAG.getMachineFunction();
    1989          41 :   const Function *CallerF = MF.getFunction();
    1990          41 :   CallingConv::ID CallerCC = CallerF->getCallingConv();
    1991          82 :   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    1992          41 :   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
    1993             : 
    1994             :   // Kernels aren't callable and don't have a live-in return address, so it
    1995             :   // doesn't make sense to do a tail call with entry functions.
    1996          41 :   if (!CallerPreserved)
    1997             :     return false;
    1998             : 
    1999          38 :   bool CCMatch = CallerCC == CalleeCC;
    2000             : 
    2001          38 :   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    2002           0 :     if (canGuaranteeTCO(CalleeCC) && CCMatch)
    2003             :       return true;
    2004             :     return false;
    2005             :   }
    2006             : 
    2007             :   // TODO: Can we handle var args?
    2008          38 :   if (IsVarArg)
    2009             :     return false;
    2010             : 
    2011         134 :   for (const Argument &Arg : CallerF->args()) {
    2012          99 :     if (Arg.hasByValAttr())
    2013             :       return false;
    2014             :   }
    2015             : 
    2016          35 :   LLVMContext &Ctx = *DAG.getContext();
    2017             : 
    2018             :   // Check that the call results are passed in the same way.
    2019          35 :   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
    2020             :                                   CCAssignFnForCall(CalleeCC, IsVarArg),
    2021             :                                   CCAssignFnForCall(CallerCC, IsVarArg)))
    2022             :     return false;
    2023             : 
    2024             :   // The callee has to preserve all registers the caller needs to preserve.
    2025          35 :   if (!CCMatch) {
    2026           0 :     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    2027           0 :     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    2028             :       return false;
    2029             :   }
    2030             : 
    2031             :   // Nothing more to check if the callee is taking no arguments.
    2032          35 :   if (Outs.empty())
    2033             :     return true;
    2034             : 
    2035          33 :   SmallVector<CCValAssign, 16> ArgLocs;
    2036          66 :   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
    2037             : 
    2038          33 :   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
    2039             : 
    2040          33 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    2041             :   // If the stack arguments for this call do not fit into our own save area,
    2042             :   // then the call cannot be made a tail call.
    2043             :   // TODO: Is this really necessary?
    2044          33 :   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    2045             :     return false;
    2046             : 
    2047          30 :   const MachineRegisterInfo &MRI = MF.getRegInfo();
    2048          30 :   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
    2049             : }
    2050             : 
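                     : // A call can only be emitted as a tail call if it is marked as such, the
                     : // caller is not an entry function (kernels have no return address to return
                     : // through), and tail calls are not disabled via the "disable-tail-calls"
                     : // function attribute.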
    2051          13 : bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
    2052          13 :   if (!CI->isTailCall())
    2053             :     return false;
    2054             : 
    2055           4 :   const Function *ParentFn = CI->getParent()->getParent();
    2056           4 :   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
    2057             :     return false;
    2058             : 
    2059           2 :   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
    2060           2 :   return (Attr.getValueAsString() != "true");
    2061             : }
    2062             : 
    2063             : // The wave scratch offset register is used as the global base pointer.
    2064         455 : SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
    2065             :                                     SmallVectorImpl<SDValue> &InVals) const {
    2066         455 :   SelectionDAG &DAG = CLI.DAG;
    2067         455 :   const SDLoc &DL = CLI.DL;
    2068         455 :   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    2069         455 :   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
    2070         455 :   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
    2071         455 :   SDValue Chain = CLI.Chain;
    2072         455 :   SDValue Callee = CLI.Callee;
    2073         455 :   bool &IsTailCall = CLI.IsTailCall;
    2074         455 :   CallingConv::ID CallConv = CLI.CallConv;
    2075         455 :   bool IsVarArg = CLI.IsVarArg;
    2076         455 :   bool IsSibCall = false;
    2077         455 :   bool IsThisReturn = false;
    2078         455 :   MachineFunction &MF = DAG.getMachineFunction();
    2079             : 
    2080         455 :   if (IsVarArg) {
    2081             :     return lowerUnhandledCall(CLI, InVals,
    2082           2 :                               "unsupported call to variadic function ");
    2083             :   }
    2084             : 
    2085         904 :   if (!CLI.CS.getCalledFunction()) {
    2086             :     return lowerUnhandledCall(CLI, InVals,
    2087           8 :                               "unsupported indirect call to function ");
    2088             :   }
    2089             : 
    2090         450 :   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    2091             :     return lowerUnhandledCall(CLI, InVals,
    2092           2 :                               "unsupported required tail call to function ");
    2093             :   }
    2094             : 
    2095             :   // The first 4 bytes are reserved for the callee's emergency stack slot.
    2096         449 :   const unsigned CalleeUsableStackOffset = 4;
    2097             : 
    2098         449 :   if (IsTailCall) {
    2099          41 :     IsTailCall = isEligibleForTailCallOptimization(
    2100             :       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
    2101          50 :     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
    2102           0 :       report_fatal_error("failed to perform tail call elimination on a call "
    2103             :                          "site marked musttail");
    2104             :     }
    2105             : 
    2106          41 :     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
    2107             : 
    2108             :     // A sibling call is one where we're under the usual C ABI and not planning
    2109             :     // to change that but can still do a tail call:
    2110          82 :     if (!TailCallOpt && IsTailCall)
    2111          32 :       IsSibCall = true;
    2112             : 
    2113             :     if (IsTailCall)
    2114             :       ++NumTailCalls;
    2115             :   }
    2116             : 
    2117         449 :   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    2118             :     // FIXME: Remove this hack for function pointer types.
    2119         449 :     const GlobalValue *GV = GA->getGlobal();
    2120             :     assert(Callee.getValueType() == MVT::i32);
    2121         449 :     Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
    2122        1347 :                                   false, GA->getTargetFlags());
    2123             :   }
    2124             : 
    2125         449 :   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    2126             : 
    2127             :   // Analyze operands of the call, assigning locations to each operand.
    2128         449 :   SmallVector<CCValAssign, 16> ArgLocs;
    2129         898 :   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
    2130         449 :   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
    2131         449 :   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
    2132             : 
    2133             :   // Get a count of how many bytes are to be pushed on the stack.
    2134         449 :   unsigned NumBytes = CCInfo.getNextStackOffset();
    2135             : 
    2136         449 :   if (IsSibCall) {
    2137             :     // Since we're not changing the ABI to make this a tail call, the memory
    2138             :     // operands are already available in the caller's incoming argument space.
    2139          32 :     NumBytes = 0;
    2140             :   }
    2141             : 
    2142             :   // FPDiff is the byte offset of the call's argument area from the callee's.
    2143             :   // Stores to callee stack arguments will be placed in FixedStackSlots offset
    2144             :   // by this amount for a tail call. In a sibling call it must be 0 because the
    2145             :   // caller will deallocate the entire stack and the callee still expects its
    2146             :   // arguments to begin at SP+0. Completely unused for non-tail calls.
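                     :   // In this lowering FPDiff currently always remains 0: guaranteed tail calls
                     :   // are rejected above, so any tail call reaching this point is a sibling call.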
    2147         449 :   int32_t FPDiff = 0;
    2148         449 :   MachineFrameInfo &MFI = MF.getFrameInfo();
    2149         898 :   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    2150             : 
    2151             :   // Adjust the stack pointer for the new arguments...
    2152             :   // These operations are automatically eliminated by the prolog/epilog pass
    2153         449 :   if (!IsSibCall) {
    2154         417 :     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
    2155             : 
    2156         417 :     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
    2157             : 
    2158             :     // In the HSA case, this should be an identity copy.
    2159             :     SDValue ScratchRSrcReg
    2160         417 :       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
    2161         417 :     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    2162             : 
    2163             :     // TODO: Don't hardcode these registers and get from the callee function.
    2164             :     SDValue ScratchWaveOffsetReg
    2165         417 :       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
    2166         417 :     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
    2167             :   }
    2168             : 
    2169             :   // Stack pointer relative accesses are done by changing the offset SGPR. This
    2170             :   // is just the VGPR offset component.
    2171         449 :   SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
    2172             : 
    2173         898 :   SmallVector<SDValue, 8> MemOpChains;
    2174         449 :   MVT PtrVT = MVT::i32;
    2175             : 
    2176             :   // Walk the register/memloc assignments, inserting copies/loads.
    2177        1946 :   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
    2178             :        ++i, ++realArgIdx) {
    2179        2096 :     CCValAssign &VA = ArgLocs[i];
    2180        2096 :     SDValue Arg = OutVals[realArgIdx];
    2181             : 
    2182             :     // Promote the value if needed.
    2183        1048 :     switch (VA.getLocInfo()) {
    2184             :     case CCValAssign::Full:
    2185             :       break;
    2186           0 :     case CCValAssign::BCvt:
    2187           0 :       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    2188           0 :       break;
    2189          10 :     case CCValAssign::ZExt:
    2190          20 :       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    2191          10 :       break;
    2192          10 :     case CCValAssign::SExt:
    2193          20 :       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    2194          10 :       break;
    2195           4 :     case CCValAssign::AExt:
    2196           8 :       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    2197           4 :       break;
    2198           0 :     case CCValAssign::FPExt:
    2199           0 :       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
    2200           0 :       break;
    2201           0 :     default:
    2202           0 :       llvm_unreachable("Unknown loc info!");
    2203             :     }
    2204             : 
    2205        1048 :     if (VA.isRegLoc()) {
    2206        1972 :       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    2207             :     } else {
    2208             :       assert(VA.isMemLoc());
    2209             : 
    2210          62 :       SDValue DstAddr;
    2211          62 :       MachinePointerInfo DstInfo;
    2212             : 
    2213          62 :       unsigned LocMemOffset = VA.getLocMemOffset();
    2214          62 :       int32_t Offset = LocMemOffset;
    2215          62 :       SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
    2216         124 :       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
    2217             : 
    2218          62 :       if (IsTailCall) {
    2219          54 :         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    2220          51 :         unsigned OpSize = Flags.isByVal() ?
    2221          51 :           Flags.getByValSize() : VA.getValVT().getStoreSize();
    2222             : 
    2223          27 :         Offset = Offset + FPDiff;
    2224          27 :         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
    2225             : 
    2226          27 :         DstAddr = DAG.getFrameIndex(FI, PtrVT);
    2227          54 :         DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
    2228          27 :         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
    2229             : 
    2230             :         // Make sure any stack arguments overlapping with where we're storing
    2231             :         // are loaded before this eventual operation. Otherwise they'll be
    2232             :         // clobbered.
    2233             : 
    2234             :         // FIXME: Why is this really necessary? This seems to just result in a
    2235             :         // lot of code to copy the stack and write them back to the same
    2236             :         // locations, which are supposed to be immutable?
    2237          27 :         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
    2238             :       } else {
    2239          35 :         DstAddr = PtrOff;
    2240          35 :         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
    2241             :       }
    2242             : 
    2243         186 :       if (Outs[i].Flags.isByVal()) {
    2244             :         SDValue SizeNode =
    2245          56 :             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
    2246             :         SDValue Cpy = DAG.getMemcpy(
    2247          56 :             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
    2248             :             /*isVol = */ false, /*AlwaysInline = */ true,
    2249             :             /*isTailCall = */ false,
    2250          84 :             DstInfo, MachinePointerInfo());
    2251             : 
    2252          28 :         MemOpChains.push_back(Cpy);
    2253             :       } else {
    2254          34 :         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
    2255          34 :         MemOpChains.push_back(Store);
    2256             :       }
    2257             :     }
    2258             :   }
    2259             : 
    2260             :   // Copy special input registers after user input arguments.
    2261         449 :   passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
    2262             : 
    2263         449 :   if (!MemOpChains.empty())
    2264         138 :     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
    2265             : 
    2266             :   // Build a sequence of copy-to-reg nodes chained together with token chain
    2267             :   // and flag operands which copy the outgoing args into the appropriate regs.
    2268         449 :   SDValue InFlag;
    2269        3277 :   for (auto &RegToPass : RegsToPass) {
    2270        1930 :     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
    2271        1930 :                              RegToPass.second, InFlag);
    2272        3860 :     InFlag = Chain.getValue(1);
    2273             :   }
    2274             : 
    2275             : 
    2276         449 :   SDValue PhysReturnAddrReg;
    2277         449 :   if (IsTailCall) {
    2278             :     // Since the return is being combined with the call, we need to pass on the
    2279             :     // return address.
    2280             : 
    2281          64 :     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
    2282             :     SDValue ReturnAddrReg = CreateLiveInRegister(
    2283          64 :       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    2284             : 
    2285          32 :     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    2286          64 :                                         MVT::i64);
    2287          32 :     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
    2288          64 :     InFlag = Chain.getValue(1);
    2289             :   }
    2290             : 
    2291             :   // We don't usually want to end the call-sequence here because we would tidy
    2292             :   // the frame up *after* the call. However, in the ABI-changing tail-call case
    2293             :   // we've carefully laid out the parameters so that when sp is reset they'll be
    2294             :   // in the correct location.
    2295         449 :   if (IsTailCall && !IsSibCall) {
    2296           0 :     Chain = DAG.getCALLSEQ_END(Chain,
    2297           0 :                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
    2298           0 :                                DAG.getTargetConstant(0, DL, MVT::i32),
    2299           0 :                                InFlag, DL);
    2300           0 :     InFlag = Chain.getValue(1);
    2301             :   }
    2302             : 
    2303         898 :   std::vector<SDValue> Ops;
    2304         449 :   Ops.push_back(Chain);
    2305         449 :   Ops.push_back(Callee);
    2306             : 
    2307         449 :   if (IsTailCall) {
    2308             :     // Each tail call may have to adjust the stack by a different amount, so
    2309             :     // this information must travel along with the operation for eventual
    2310             :     // consumption by emitEpilogue.
    2311          96 :     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
    2312             : 
    2313          32 :     Ops.push_back(PhysReturnAddrReg);
    2314             :   }
    2315             : 
    2316             :   // Add argument registers to the end of the list so that they are known live
    2317             :   // into the call.
    2318        3277 :   for (auto &RegToPass : RegsToPass) {
    2319        3860 :     Ops.push_back(DAG.getRegister(RegToPass.first,
    2320        3860 :                                   RegToPass.second.getValueType()));
    2321             :   }
    2322             : 
    2323             :   // Add a register mask operand representing the call-preserved registers.
    2324             : 
    2325         449 :   const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
    2326         449 :   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
    2327             :   assert(Mask && "Missing call preserved mask for calling convention");
    2328         898 :   Ops.push_back(DAG.getRegisterMask(Mask));
    2329             : 
    2330         449 :   if (InFlag.getNode())
    2331         449 :     Ops.push_back(InFlag);
    2332             : 
    2333         898 :   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    2334             : 
    2335             :   // If we're doing a tail call, use a TC_RETURN here rather than an
    2336             :   // actual call instruction.
    2337         449 :   if (IsTailCall) {
    2338          32 :     MFI.setHasTailCall();
    2339          32 :     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
    2340             :   }
    2341             : 
    2342             :   // Returns a chain and a flag for retval copy to use.
    2343         417 :   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
    2344         834 :   Chain = Call.getValue(0);
    2345         834 :   InFlag = Call.getValue(1);
    2346             : 
    2347         417 :   uint64_t CalleePopBytes = 0;
    2348         834 :   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
    2349         417 :                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
    2350        1251 :                              InFlag, DL);
    2351         417 :   if (!Ins.empty())
    2352         202 :     InFlag = Chain.getValue(1);
    2353             : 
    2354             :   // Handle result values, copying them out of physregs into vregs that we
    2355             :   // return.
    2356             :   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
    2357             :                          InVals, IsThisReturn,
    2358         417 :                          IsThisReturn ? OutVals[0] : SDValue());
    2359             : }
    2360             : 
    2361          27 : unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
    2362             :                                              SelectionDAG &DAG) const {
    2363          27 :   unsigned Reg = StringSwitch<unsigned>(RegName)
    2364          81 :     .Case("m0", AMDGPU::M0)
    2365          81 :     .Case("exec", AMDGPU::EXEC)
    2366          81 :     .Case("exec_lo", AMDGPU::EXEC_LO)
    2367          81 :     .Case("exec_hi", AMDGPU::EXEC_HI)
    2368          81 :     .Case("flat_scratch", AMDGPU::FLAT_SCR)
    2369          81 :     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    2370          81 :     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    2371          54 :     .Default(AMDGPU::NoRegister);
    2372             : 
    2373          27 :   if (Reg == AMDGPU::NoRegister) {
    2374           0 :     report_fatal_error(Twine("invalid register name \""
    2375           0 :                              + StringRef(RegName)  + "\"."));
    2376             : 
    2377             :   }
    2378             : 
    2379          30 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
    2380           3 :       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    2381           1 :     report_fatal_error(Twine("invalid register \""
    2382           3 :                              + StringRef(RegName)  + "\" for subtarget."));
    2383             :   }
    2384             : 
    2385          26 :   switch (Reg) {
    2386          17 :   case AMDGPU::M0:
    2387             :   case AMDGPU::EXEC_LO:
    2388             :   case AMDGPU::EXEC_HI:
    2389             :   case AMDGPU::FLAT_SCR_LO:
    2390             :   case AMDGPU::FLAT_SCR_HI:
    2391          17 :     if (VT.getSizeInBits() == 32)
    2392             :       return Reg;
    2393             :     break;
    2394           9 :   case AMDGPU::EXEC:
    2395             :   case AMDGPU::FLAT_SCR:
    2396           9 :     if (VT.getSizeInBits() == 64)
    2397             :       return Reg;
    2398             :     break;
    2399           0 :   default:
    2400           0 :     llvm_unreachable("missing register type checking");
    2401             :   }
    2402             : 
    2403           2 :   report_fatal_error(Twine("invalid type for register \""
    2404           6 :                            + StringRef(RegName) + "\"."));
    2405             : }
    2406             : 
    2407             : // If kill is not the last instruction, split the block so kill is always a
    2408             : // proper terminator.
    2409          33 : MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
    2410             :                                                     MachineBasicBlock *BB) const {
    2411          33 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    2412             : 
    2413          33 :   MachineBasicBlock::iterator SplitPoint(&MI);
    2414          33 :   ++SplitPoint;
    2415             : 
    2416          66 :   if (SplitPoint == BB->end()) {
    2417             :     // Don't bother with a new block.
    2418          12 :     MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    2419           4 :     return BB;
    2420             :   }
    2421             : 
    2422          29 :   MachineFunction *MF = BB->getParent();
    2423             :   MachineBasicBlock *SplitBB
    2424          29 :     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
    2425             : 
    2426          87 :   MF->insert(++MachineFunction::iterator(BB), SplitBB);
    2427          87 :   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
    2428             : 
    2429          29 :   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
    2430          29 :   BB->addSuccessor(SplitBB);
    2431             : 
    2432          87 :   MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    2433          29 :   return SplitBB;
    2434             : }
    2435             : 
    2436             : // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
    2437             : // wavefront. If the value is uniform and just happens to be in a VGPR, this
    2438             : // will only do one iteration. In the worst case, this will loop 64 times.
    2439             : //
    2440             : // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
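                     : //
                     : // Roughly, the loop built below looks like this (the PHIs for the result and
                     : // saved exec, the GPR-index-mode variant, and the nonzero-offset add are
                     : // omitted):
                     : //
                     : //   loop:
                     : //     v_readfirstlane_b32 s_idx, v_idx
                     : //     v_cmp_eq_u32_e64    s_cond, s_idx, v_idx
                     : //     s_mov_b32           m0, s_idx
                     : //     s_and_saveexec_b64  s_save, s_cond
                     : //     ; the indexed move is inserted at the iterator returned to the caller
                     : //     s_xor_b64           exec, exec, s_save
                     : //     s_cbranch_execnz    loop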
    2441          33 : static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
    2442             :   const SIInstrInfo *TII,
    2443             :   MachineRegisterInfo &MRI,
    2444             :   MachineBasicBlock &OrigBB,
    2445             :   MachineBasicBlock &LoopBB,
    2446             :   const DebugLoc &DL,
    2447             :   const MachineOperand &IdxReg,
    2448             :   unsigned InitReg,
    2449             :   unsigned ResultReg,
    2450             :   unsigned PhiReg,
    2451             :   unsigned InitSaveExecReg,
    2452             :   int Offset,
    2453             :   bool UseGPRIdxMode) {
    2454          33 :   MachineBasicBlock::iterator I = LoopBB.begin();
    2455             : 
    2456          33 :   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2457          33 :   unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2458          33 :   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2459          33 :   unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2460             : 
    2461          66 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    2462          33 :     .addReg(InitReg)
    2463          33 :     .addMBB(&OrigBB)
    2464          33 :     .addReg(ResultReg)
    2465          33 :     .addMBB(&LoopBB);
    2466             : 
    2467          66 :   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    2468          33 :     .addReg(InitSaveExecReg)
    2469          33 :     .addMBB(&OrigBB)
    2470          33 :     .addReg(NewExec)
    2471          33 :     .addMBB(&LoopBB);
    2472             : 
    2473             :   // Read the next variant <- also loop target.
    2474          99 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    2475          66 :     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    2476             : 
    2477             :   // Compare the just-read index value to the Idx value in each lane.
    2478          99 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    2479          33 :     .addReg(CurrentIdxReg)
    2480          33 :     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
    2481             : 
    2482          33 :   if (UseGPRIdxMode) {
    2483             :     unsigned IdxReg;
    2484          16 :     if (Offset == 0) {
    2485             :       IdxReg = CurrentIdxReg;
    2486             :     } else {
    2487           6 :       IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2488          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
    2489           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2490          12 :         .addImm(Offset);
    2491             :     }
    2492             : 
    2493             :     MachineInstr *SetIdx =
    2494          48 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
    2495          16 :       .addReg(IdxReg, RegState::Kill);
    2496          16 :     SetIdx->getOperand(2).setIsUndef();
    2497             :   } else {
    2498             :     // Move index from VCC into M0
    2499          17 :     if (Offset == 0) {
    2500          33 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2501          11 :         .addReg(CurrentIdxReg, RegState::Kill);
    2502             :     } else {
    2503          18 :       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2504           6 :         .addReg(CurrentIdxReg, RegState::Kill)
    2505          12 :         .addImm(Offset);
    2506             :     }
    2507             :   }
    2508             : 
    2509             :   // Update EXEC, save the original EXEC value to VCC.
    2510          99 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    2511          33 :     .addReg(CondReg, RegState::Kill);
    2512             : 
    2513          33 :   MRI.setSimpleHint(NewExec, CondReg);
    2514             : 
    2515             :   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
    2516             :   MachineInstr *InsertPt =
    2517          99 :     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    2518          33 :     .addReg(AMDGPU::EXEC)
    2519          33 :     .addReg(NewExec);
    2520             : 
    2521             :   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
    2522             :   // s_cbranch_scc0?
    2523             : 
    2524             :   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
    2525          99 :   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    2526          33 :     .addMBB(&LoopBB);
    2527             : 
    2528          66 :   return InsertPt->getIterator();
    2529             : }
    2530             : 
    2531             : // This has slightly sub-optimal register allocation when the source vector is
    2532             : // killed by the read. The register allocator does not understand that the kill
    2533             : // is per-workitem, so the vector is kept alive for the whole loop, and we end
    2534             : // up not re-using a subregister from it, using 1 more VGPR than necessary. This
    2535             : // VGPR was saved when this was expanded after register allocation.
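                     : //
                     : // The block containing \p MI is split into MBB -> LoopBB -> RemainderBB: EXEC
                     : // is saved to an SGPR pair before the loop and restored at the top of
                     : // RemainderBB once every index value has been processed.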
    2536          33 : static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
    2537             :                                                   MachineBasicBlock &MBB,
    2538             :                                                   MachineInstr &MI,
    2539             :                                                   unsigned InitResultReg,
    2540             :                                                   unsigned PhiReg,
    2541             :                                                   int Offset,
    2542             :                                                   bool UseGPRIdxMode) {
    2543          33 :   MachineFunction *MF = MBB.getParent();
    2544          33 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2545          33 :   const DebugLoc &DL = MI.getDebugLoc();
    2546          33 :   MachineBasicBlock::iterator I(&MI);
    2547             : 
    2548          33 :   unsigned DstReg = MI.getOperand(0).getReg();
    2549          33 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2550          33 :   unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    2551             : 
    2552          66 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
    2553             : 
    2554             :   // Save the EXEC mask
    2555          99 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    2556          33 :     .addReg(AMDGPU::EXEC);
    2557             : 
    2558             :   // To insert the loop we need to split the block. Move everything after this
    2559             :   // point to a new block, and insert a new empty block between the two.
    2560          33 :   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    2561          33 :   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    2562          33 :   MachineFunction::iterator MBBI(MBB);
    2563          33 :   ++MBBI;
    2564             : 
    2565          33 :   MF->insert(MBBI, LoopBB);
    2566          33 :   MF->insert(MBBI, RemainderBB);
    2567             : 
    2568          33 :   LoopBB->addSuccessor(LoopBB);
    2569          33 :   LoopBB->addSuccessor(RemainderBB);
    2570             : 
    2571             :   // Move the rest of the block into a new block.
    2572          33 :   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    2573          99 :   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    2574             : 
    2575          33 :   MBB.addSuccessor(LoopBB);
    2576             : 
    2577          33 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2578             : 
    2579             :   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
    2580             :                                       InitResultReg, DstReg, PhiReg, TmpExec,
    2581          33 :                                       Offset, UseGPRIdxMode);
    2582             : 
    2583          33 :   MachineBasicBlock::iterator First = RemainderBB->begin();
    2584          99 :   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    2585          33 :     .addReg(SaveExec);
    2586             : 
    2587          33 :   return InsPt;
    2588             : }
    2589             : 
    2590             : // Returns subreg index, offset
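                     : // e.g. indexing a 128-bit (4 x 32-bit) register with Offset 2 yields
                     : // {sub2, 0}; an out-of-bounds offset is passed through unchanged with sub0.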
    2591             : static std::pair<unsigned, int>
    2592             : computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
    2593             :                             const TargetRegisterClass *SuperRC,
    2594             :                             unsigned VecReg,
    2595             :                             int Offset) {
    2596         344 :   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    2597             : 
    2598             :   // Skip out of bounds offsets, or else we would end up using an undefined
    2599             :   // register.
    2600         172 :   if (Offset >= NumElts || Offset < 0)
    2601             :     return std::make_pair(AMDGPU::sub0, Offset);
    2602             : 
    2603         396 :   return std::make_pair(AMDGPU::sub0 + Offset, 0);
    2604             : }
    2605             : 
    2606             : // Return true if the index is an SGPR and was set.
    2607         172 : static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
    2608             :                                  MachineRegisterInfo &MRI,
    2609             :                                  MachineInstr &MI,
    2610             :                                  int Offset,
    2611             :                                  bool UseGPRIdxMode,
    2612             :                                  bool IsIndirectSrc) {
    2613         172 :   MachineBasicBlock *MBB = MI.getParent();
    2614         172 :   const DebugLoc &DL = MI.getDebugLoc();
    2615         172 :   MachineBasicBlock::iterator I(&MI);
    2616             : 
    2617         172 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2618         344 :   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
    2619             : 
    2620             :   assert(Idx->getReg() != AMDGPU::NoRegister);
    2621             : 
    2622         344 :   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    2623             :     return false;
    2624             : 
    2625         139 :   if (UseGPRIdxMode) {
    2626          42 :     unsigned IdxMode = IsIndirectSrc ?
    2627             :       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    2628          42 :     if (Offset == 0) {
    2629             :       MachineInstr *SetOn =
    2630          56 :           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2631          28 :               .add(*Idx)
    2632          56 :               .addImm(IdxMode);
    2633             : 
    2634          28 :       SetOn->getOperand(3).setIsUndef();
    2635             :     } else {
    2636          14 :       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    2637          28 :       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
    2638          14 :           .add(*Idx)
    2639          28 :           .addImm(Offset);
    2640             :       MachineInstr *SetOn =
    2641          42 :         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2642          14 :         .addReg(Tmp, RegState::Kill)
    2643          28 :         .addImm(IdxMode);
    2644             : 
    2645          14 :       SetOn->getOperand(3).setIsUndef();
    2646             :     }
    2647             : 
    2648             :     return true;
    2649             :   }
    2650             : 
    2651          97 :   if (Offset == 0) {
    2652         249 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2653          83 :       .add(*Idx);
    2654             :   } else {
    2655          28 :     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    2656          14 :       .add(*Idx)
    2657          28 :       .addImm(Offset);
    2658             :   }
    2659             : 
    2660             :   return true;
    2661             : }
    2662             : 
    2663             : // Control flow needs to be inserted if indexing with a VGPR.
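                     : // If the index is already in an SGPR, a single V_MOVRELS (or, in GPR index
                     : // mode, a plain V_MOV with M0 as the index) suffices; a VGPR index falls back
                     : // to the waterfall loop built by loadM0FromVGPR above.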
    2664          78 : static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
    2665             :                                           MachineBasicBlock &MBB,
    2666             :                                           const SISubtarget &ST) {
    2667          78 :   const SIInstrInfo *TII = ST.getInstrInfo();
    2668          78 :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    2669          78 :   MachineFunction *MF = MBB.getParent();
    2670          78 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2671             : 
    2672          78 :   unsigned Dst = MI.getOperand(0).getReg();
    2673          78 :   unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
    2674          78 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    2675             : 
    2676          78 :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
    2677             : 
    2678             :   unsigned SubReg;
    2679             :   std::tie(SubReg, Offset)
    2680         312 :     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
    2681             : 
    2682         156 :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    2683             : 
    2684          78 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    2685          65 :     MachineBasicBlock::iterator I(&MI);
    2686          65 :     const DebugLoc &DL = MI.getDebugLoc();
    2687             : 
    2688          65 :     if (UseGPRIdxMode) {
    2689             :       // TODO: Look at the uses to avoid the copy. This may require rescheduling
    2690             :       // to avoid interfering with other uses, so probably requires a new
    2691             :       // optimization pass.
    2692          66 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2693          22 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2694          22 :         .addReg(SrcReg, RegState::Implicit)
    2695          22 :         .addReg(AMDGPU::M0, RegState::Implicit);
    2696          44 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2697             :     } else {
    2698         129 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2699          43 :         .addReg(SrcReg, RegState::Undef, SubReg)
    2700          43 :         .addReg(SrcReg, RegState::Implicit);
    2701             :     }
    2702             : 
    2703          65 :     MI.eraseFromParent();
    2704             : 
    2705             :     return &MBB;
    2706             :   }
    2707             : 
    2708          13 :   const DebugLoc &DL = MI.getDebugLoc();
    2709          13 :   MachineBasicBlock::iterator I(&MI);
    2710             : 
    2711          13 :   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2712          13 :   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2713             : 
    2714          26 :   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
    2715             : 
    2716          13 :   if (UseGPRIdxMode) {
    2717          12 :     MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2718           6 :       .addImm(0) // Reset inside loop.
    2719           6 :       .addImm(VGPRIndexMode::SRC0_ENABLE);
    2720          12 :     SetOn->getOperand(3).setIsUndef();
    2721             : 
    2722             :     // Disable again after the loop.
    2723          18 :     BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2724             :   }
    2725             : 
    2726          13 :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
    2727          13 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    2728             : 
    2729          13 :   if (UseGPRIdxMode) {
    2730          18 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    2731           6 :       .addReg(SrcReg, RegState::Undef, SubReg)
    2732           6 :       .addReg(SrcReg, RegState::Implicit)
    2733           6 :       .addReg(AMDGPU::M0, RegState::Implicit);
    2734             :   } else {
    2735          21 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    2736           7 :       .addReg(SrcReg, RegState::Undef, SubReg)
    2737           7 :       .addReg(SrcReg, RegState::Implicit);
    2738             :   }
    2739             : 
    2740          13 :   MI.eraseFromParent();
    2741             : 
    2742          13 :   return LoopBB;
    2743             : }
    2744             : 
    2745             : static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
    2746             :                                  const TargetRegisterClass *VecRC) {
    2747         128 :   switch (TRI.getRegSizeInBits(*VecRC)) {
    2748             :   case 32: // 4 bytes
    2749             :     return AMDGPU::V_MOVRELD_B32_V1;
    2750           6 :   case 64: // 8 bytes
    2751             :     return AMDGPU::V_MOVRELD_B32_V2;
    2752          44 :   case 128: // 16 bytes
    2753             :     return AMDGPU::V_MOVRELD_B32_V4;
    2754          10 :   case 256: // 32 bytes
    2755             :     return AMDGPU::V_MOVRELD_B32_V8;
    2756           4 :   case 512: // 64 bytes
    2757             :     return AMDGPU::V_MOVRELD_B32_V16;
    2758           0 :   default:
    2759           0 :     llvm_unreachable("unsupported size for MOVRELD pseudos");
    2760             :   }
    2761             : }
    2762             : 
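                     : // Lower an indirect write into a vector register. This mirrors emitIndirectSrc:
                     : // an SGPR index needs only a single V_MOVRELD pseudo (or V_MOV_B32_indirect in
                     : // GPR index mode), while a VGPR index requires the waterfall loop.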
    2763          94 : static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
    2764             :                                           MachineBasicBlock &MBB,
    2765             :                                           const SISubtarget &ST) {
    2766          94 :   const SIInstrInfo *TII = ST.getInstrInfo();
    2767          94 :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
    2768          94 :   MachineFunction *MF = MBB.getParent();
    2769          94 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    2770             : 
    2771          94 :   unsigned Dst = MI.getOperand(0).getReg();
    2772          94 :   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    2773          94 :   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    2774          94 :   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
    2775          94 :   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    2776         188 :   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
    2777             : 
    2778             :   // This can be an immediate, but will be folded later.
    2779             :   assert(Val->getReg());
    2780             : 
    2781             :   unsigned SubReg;
    2782         282 :   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
    2783             :                                                          SrcVec->getReg(),
    2784         188 :                                                          Offset);
    2785         188 :   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
    2786             : 
    2787          94 :   if (Idx->getReg() == AMDGPU::NoRegister) {
    2788           0 :     MachineBasicBlock::iterator I(&MI);
    2789           0 :     const DebugLoc &DL = MI.getDebugLoc();
    2790             : 
    2791             :     assert(Offset == 0);
    2792             : 
    2793           0 :     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
    2794           0 :         .add(*SrcVec)
    2795           0 :         .add(*Val)
    2796           0 :         .addImm(SubReg);
    2797             : 
    2798           0 :     MI.eraseFromParent();
    2799             :     return &MBB;
    2800             :   }
    2801             : 
    2802          94 :   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    2803          74 :     MachineBasicBlock::iterator I(&MI);
    2804          74 :     const DebugLoc &DL = MI.getDebugLoc();
    2805             : 
    2806          74 :     if (UseGPRIdxMode) {
    2807          60 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    2808          20 :           .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
    2809          20 :           .add(*Val)
    2810          20 :           .addReg(Dst, RegState::ImplicitDefine)
    2811          20 :           .addReg(SrcVec->getReg(), RegState::Implicit)
    2812          20 :           .addReg(AMDGPU::M0, RegState::Implicit);
    2813             : 
    2814          40 :       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2815             :     } else {
    2816         162 :       const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    2817             : 
    2818         108 :       BuildMI(MBB, I, DL, MovRelDesc)
    2819          54 :           .addReg(Dst, RegState::Define)
    2820          54 :           .addReg(SrcVec->getReg())
    2821          54 :           .add(*Val)
    2822         108 :           .addImm(SubReg - AMDGPU::sub0);
    2823             :     }
    2824             : 
    2825          74 :     MI.eraseFromParent();
    2826             :     return &MBB;
    2827             :   }
    2828             : 
    2829          20 :   if (Val->isReg())
    2830          20 :     MRI.clearKillFlags(Val->getReg());
    2831             : 
    2832          20 :   const DebugLoc &DL = MI.getDebugLoc();
    2833             : 
    2834          20 :   if (UseGPRIdxMode) {
    2835          10 :     MachineBasicBlock::iterator I(&MI);
    2836             : 
    2837          20 :     MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
    2838          10 :       .addImm(0) // Reset inside loop.
    2839          10 :       .addImm(VGPRIndexMode::DST_ENABLE);
    2840          20 :     SetOn->getOperand(3).setIsUndef();
    2841             : 
    2842             :     // Disable again after the loop.
    2843          30 :     BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    2844             :   }
    2845             : 
    2846          20 :   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
    2847             : 
    2848             :   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
    2849          20 :                               Offset, UseGPRIdxMode);
    2850          20 :   MachineBasicBlock *LoopBB = InsPt->getParent();
    2851             : 
    2852          20 :   if (UseGPRIdxMode) {
    2853          30 :     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
    2854          10 :         .addReg(PhiReg, RegState::Undef, SubReg) // vdst
    2855          10 :         .add(*Val)                               // src0
    2856          10 :         .addReg(Dst, RegState::ImplicitDefine)
    2857          10 :         .addReg(PhiReg, RegState::Implicit)
    2858          10 :         .addReg(AMDGPU::M0, RegState::Implicit);
    2859             :   } else {
    2860          30 :     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
    2861             : 
    2862          20 :     BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
    2863          10 :         .addReg(Dst, RegState::Define)
    2864          10 :         .addReg(PhiReg)
    2865          10 :         .add(*Val)
    2866          20 :         .addImm(SubReg - AMDGPU::sub0);
    2867             :   }
    2868             : 
    2869          20 :   MI.eraseFromParent();
    2870             : 
    2871          20 :   return LoopBB;
    2872             : }
    2873             : 
    2874        9533 : MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    2875             :   MachineInstr &MI, MachineBasicBlock *BB) const {
    2876             : 
    2877        9533 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    2878        9533 :   MachineFunction *MF = BB->getParent();
    2879        9533 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    2880             : 
    2881        9533 :   if (TII->isMIMG(MI)) {
    2882         314 :     if (!MI.memoperands_empty())
    2883             :       return BB;
    2884             :     // Add a memoperand for mimg instructions so that they aren't assumed to
    2885             :     // be ordered memory instructions.
    2886             : 
    2887         628 :     MachinePointerInfo PtrInfo(MFI->getImagePSV());
    2888         314 :     MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
    2889         314 :     if (MI.mayStore())
    2890             :       Flags |= MachineMemOperand::MOStore;
    2891             : 
    2892         314 :     if (MI.mayLoad())
    2893             :       Flags |= MachineMemOperand::MOLoad;
    2894             : 
    2895         314 :     auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
    2896         314 :     MI.addMemOperand(*MF, MMO);
    2897         314 :     return BB;
    2898             :   }
    2899             : 
    2900       18438 :   switch (MI.getOpcode()) {
    2901        7539 :   case AMDGPU::SI_INIT_M0:
    2902       15078 :     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
    2903       30156 :             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    2904       15078 :         .add(MI.getOperand(0));
    2905        7539 :     MI.eraseFromParent();
    2906        7539 :     return BB;
    2907             : 
    2908           2 :   case AMDGPU::SI_INIT_EXEC:
    2909             :     // This should be before all vector instructions.
    2910           6 :     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    2911           6 :             AMDGPU::EXEC)
    2912           4 :         .addImm(MI.getOperand(0).getImm());
    2913           2 :     MI.eraseFromParent();
    2914           2 :     return BB;
    2915             : 
    2916           4 :   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    2917             :     // Extract the thread count from an SGPR input and set EXEC accordingly.
    2918             :     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    2919             :     //
    2920             :     // S_BFE_U32 count, input, {shift, 7}
    2921             :     // S_BFM_B64 exec, count, 0
    2922             :     // S_CMP_EQ_U32 count, 64
    2923             :     // S_CMOV_B64 exec, -1
    2924           8 :     MachineInstr *FirstMI = &*BB->begin();
    2925           4 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    2926           4 :     unsigned InputReg = MI.getOperand(0).getReg();
    2927           4 :     unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    2928           4 :     bool Found = false;
    2929             : 
    2930             :     // Move the COPY of the input reg to the beginning, so that we can use it.
    2931          20 :     for (auto I = BB->begin(); I != &MI; I++) {
    2932          30 :       if (I->getOpcode() != TargetOpcode::COPY ||
    2933          10 :           I->getOperand(0).getReg() != InputReg)
    2934             :         continue;
    2935             : 
    2936           4 :       if (I == FirstMI) {
    2937           0 :         FirstMI = &*++BB->begin();
    2938             :       } else {
    2939           4 :         I->removeFromParent();
    2940          12 :         BB->insert(FirstMI, &*I);
    2941             :       }
    2942             :       Found = true;
    2943             :       break;
    2944             :     }
    2945             :     assert(Found);
    2946             :     (void)Found;
    2947             : 
    2948             :     // This should be before all vector instructions.
    2949          24 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
    2950           4 :         .addReg(InputReg)
    2951           8 :         .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    2952          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
    2953           8 :             AMDGPU::EXEC)
    2954           4 :         .addReg(CountReg)
    2955           4 :         .addImm(0);
    2956          24 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
    2957           4 :         .addReg(CountReg, RegState::Kill)
    2958           4 :         .addImm(64);
    2959          20 :     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
    2960           8 :             AMDGPU::EXEC)
    2961           4 :         .addImm(-1);
    2962           4 :     MI.eraseFromParent();
    2963           4 :     return BB;
    2964             :   }
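A minimal scalar sketch (illustrative only, not part of the listed source) of the EXEC
mask the sequence above builds; it shows why the count == 64 case needs the extra
S_CMP_EQ_U32 / S_CMOV_B64 pair, since a bit-field mask of 64 ones cannot be produced
by S_BFM_B64 alone (nor by a 64-bit shift in C++):

  #include <cstdint>

  // Mask of 'Count' low bits set, with the Count == 64 case handled separately,
  // mirroring the S_BFM_B64 + S_CMP_EQ_U32/S_CMOV_B64 sequence emitted above.
  uint64_t execMaskForThreadCount(unsigned Count) {
    if (Count == 64)
      return ~0ull;               // S_CMOV_B64 exec, -1
    return (1ull << Count) - 1;   // S_BFM_B64 exec, count, 0
  }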
    2965             : 
    2966          61 :   case AMDGPU::GET_GROUPSTATICSIZE: {
    2967         183 :     DebugLoc DL = MI.getDebugLoc();
    2968         122 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
    2969         122 :         .add(MI.getOperand(0))
    2970         122 :         .addImm(MFI->getLDSSize());
    2971          61 :     MI.eraseFromParent();
    2972          61 :     return BB;
    2973             :   }
    2974          78 :   case AMDGPU::SI_INDIRECT_SRC_V1:
    2975             :   case AMDGPU::SI_INDIRECT_SRC_V2:
    2976             :   case AMDGPU::SI_INDIRECT_SRC_V4:
    2977             :   case AMDGPU::SI_INDIRECT_SRC_V8:
    2978             :   case AMDGPU::SI_INDIRECT_SRC_V16:
    2979          78 :     return emitIndirectSrc(MI, *BB, *getSubtarget());
    2980          94 :   case AMDGPU::SI_INDIRECT_DST_V1:
    2981             :   case AMDGPU::SI_INDIRECT_DST_V2:
    2982             :   case AMDGPU::SI_INDIRECT_DST_V4:
    2983             :   case AMDGPU::SI_INDIRECT_DST_V8:
    2984             :   case AMDGPU::SI_INDIRECT_DST_V16:
    2985          94 :     return emitIndirectDst(MI, *BB, *getSubtarget());
    2986          33 :   case AMDGPU::SI_KILL:
    2987          33 :     return splitKillBlock(MI, BB);
    2988          49 :   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    2989          49 :     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    2990             : 
    2991          49 :     unsigned Dst = MI.getOperand(0).getReg();
    2992          49 :     unsigned Src0 = MI.getOperand(1).getReg();
    2993          49 :     unsigned Src1 = MI.getOperand(2).getReg();
    2994          49 :     const DebugLoc &DL = MI.getDebugLoc();
    2995          49 :     unsigned SrcCond = MI.getOperand(3).getReg();
    2996             : 
    2997          49 :     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2998          49 :     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    2999             : 
    3000         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
    3001          49 :       .addReg(Src0, 0, AMDGPU::sub0)
    3002          49 :       .addReg(Src1, 0, AMDGPU::sub0)
    3003          49 :       .addReg(SrcCond);
    3004         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
    3005          49 :       .addReg(Src0, 0, AMDGPU::sub1)
    3006          49 :       .addReg(Src1, 0, AMDGPU::sub1)
    3007          49 :       .addReg(SrcCond);
    3008             : 
    3009         147 :     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
    3010          49 :       .addReg(DstLo)
    3011          49 :       .addImm(AMDGPU::sub0)
    3012          49 :       .addReg(DstHi)
    3013          49 :       .addImm(AMDGPU::sub1);
    3014          49 :     MI.eraseFromParent();
    3015          49 :     return BB;
    3016             :   }
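A scalar model of the split performed above (a sketch, assuming the usual V_CNDMASK
semantics of selecting the second source when the condition bit is set): the 64-bit
conditional-move pseudo is expanded into two 32-bit V_CNDMASK_B32_e64 instructions on
the sub0/sub1 halves, which are then re-assembled with REG_SEQUENCE.

  #include <cstdint>

  // D = Cond ? Src1 : Src0, computed per 32-bit half as the lowering does.
  uint64_t cndmask64(uint64_t Src0, uint64_t Src1, bool Cond) {
    uint32_t Lo = Cond ? uint32_t(Src1) : uint32_t(Src0);              // sub0 halves
    uint32_t Hi = Cond ? uint32_t(Src1 >> 32) : uint32_t(Src0 >> 32);  // sub1 halves
    return (uint64_t(Hi) << 32) | Lo;                                  // REG_SEQUENCE
  }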
    3017          76 :   case AMDGPU::SI_BR_UNDEF: {
    3018          76 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3019          76 :     const DebugLoc &DL = MI.getDebugLoc();
    3020         228 :     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
    3021         152 :                            .add(MI.getOperand(0));
    3022         152 :     Br->getOperand(1).setIsUndef(true); // read undef SCC
    3023          76 :     MI.eraseFromParent();
    3024          76 :     return BB;
    3025             :   }
    3026         834 :   case AMDGPU::ADJCALLSTACKUP:
    3027             :   case AMDGPU::ADJCALLSTACKDOWN: {
    3028         834 :     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    3029         834 :     MachineInstrBuilder MIB(*MF, &MI);
    3030         834 :     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
    3031         834 :         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
    3032             :     return BB;
    3033             :   }
    3034         449 :   case AMDGPU::SI_CALL_ISEL:
    3035             :   case AMDGPU::SI_TCRETURN_ISEL: {
    3036         449 :     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    3037         449 :     const DebugLoc &DL = MI.getDebugLoc();
    3038         449 :     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    3039             : 
    3040         449 :     MachineRegisterInfo &MRI = MF->getRegInfo();
    3041         449 :     unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    3042         449 :     MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    3043             :     assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
    3044             : 
    3045         449 :     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
    3046             : 
    3047         449 :     MachineInstrBuilder MIB;
    3048         898 :     if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
    3049        1251 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
    3050         834 :         .add(MI.getOperand(0))
    3051         417 :         .addGlobalAddress(G);
    3052             :     } else {
    3053          96 :       MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
    3054          64 :         .add(MI.getOperand(0))
    3055          32 :         .addGlobalAddress(G);
    3056             : 
    3057             :       // There is an additional imm operand for tcreturn, but it should be in the
    3058             :       // right place already.
    3059             :     }
    3060             : 
    3061        3003 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
    3062        7662 :       MIB.add(MI.getOperand(I));
    3063             : 
    3064        1347 :     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3065         449 :     MI.eraseFromParent();
    3066             :     return BB;
    3067             :   }
    3068           0 :   default:
    3069           0 :     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    3070             :   }
    3071             : }
    3072             : 
    3073        3076 : bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
    3074             :   // This currently forces unfolding various combinations of fsub into fma with
    3075             :   // free fneg'd operands. As long as we have fast FMA (controlled by
    3076             :   // isFMAFasterThanFMulAndFAdd), we should perform these.
    3077             : 
    3078             :   // When fma is quarter rate, for f64 where add / sub are at best half rate,
    3079             :   // most of these combines appear to be cycle neutral but save on instruction
    3080             :   // count / code size.
    3081        3076 :   return true;
    3082             : }
    3083             : 
    3084       18213 : EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
    3085             :                                          EVT VT) const {
    3086       18213 :   if (!VT.isVector()) {
    3087       18149 :     return MVT::i1;
    3088             :   }
    3089         128 :   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
    3090             : }
    3091             : 
    3092       86518 : MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
    3093             :   // TODO: Should i16 be used always if legal? For now it would force VALU
    3094             :   // shifts.
    3095      173036 :   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
    3096             : }
    3097             : 
    3098             : // Answering this is somewhat tricky and depends on the specific device, since
    3099             : // different devices have different rates for fma and for f64 operations.
    3100             : //
    3101             : // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
    3102             : // regardless of which device (although the number of cycles differs between
    3103             : // devices), so it is always profitable for f64.
    3104             : //
    3105             : // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
    3106             : // only on full rate devices. Normally, we should prefer selecting v_mad_f32
    3107             : // which we can always do even without fused FP ops since it returns the same
    3108             : // result as the separate operations and since it is always full
    3109             : // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
    3110             : // however does not support denormals, so we do report fma as faster if we have
    3111             : // a fast fma device and require denormals.
    3112             : //
    3113        9024 : bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
    3114        9024 :   VT = VT.getScalarType();
    3115             : 
    3116        9024 :   switch (VT.getSimpleVT().SimpleTy) {
    3117        6549 :   case MVT::f32:
    3118             :     // This is as fast on some subtargets. However, we always have full rate f32
    3119             :     // mad available, which returns the same result as the separate operations
    3120             :     // and which we should prefer over fma. We can't use mad if we want to
    3121             :     // support denormals, so only report fma as faster in that case.
    3122        6549 :     return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
    3123             :   case MVT::f64:
    3124             :     return true;
    3125        1605 :   case MVT::f16:
    3126        1605 :     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
    3127             :   default:
    3128             :     break;
    3129             :   }
    3130             : 
    3131           0 :   return false;
    3132             : }
    3133             : 
    3134             : //===----------------------------------------------------------------------===//
    3135             : // Custom DAG Lowering Operations
    3136             : //===----------------------------------------------------------------------===//
    3137             : 
    3138      168726 : SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    3139      337452 :   switch (Op.getOpcode()) {
    3140       16731 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    3141        1500 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    3142       75121 :   case ISD::LOAD: {
    3143       75121 :     SDValue Result = LowerLOAD(Op, DAG);
    3144             :     assert((!Result.getNode() ||
    3145             :             Result.getNode()->getNumValues() == 2) &&
    3146             :            "Load should return a value and a chain");
    3147       75121 :     return Result;
    3148             :   }
    3149             : 
    3150          51 :   case ISD::FSIN:
    3151             :   case ISD::FCOS:
    3152          51 :     return LowerTrig(Op, DAG);
    3153        1704 :   case ISD::SELECT: return LowerSELECT(Op, DAG);
    3154         264 :   case ISD::FDIV: return LowerFDIV(Op, DAG);
    3155         251 :   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
    3156       62784 :   case ISD::STORE: return LowerSTORE(Op, DAG);
    3157         762 :   case ISD::GlobalAddress: {
    3158         762 :     MachineFunction &MF = DAG.getMachineFunction();
    3159         762 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    3160         762 :     return LowerGlobalAddress(MFI, Op, DAG);
    3161             :   }
    3162        5602 :   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    3163        1239 :   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
    3164        1915 :   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
    3165          45 :   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
    3166          30 :   case ISD::INSERT_VECTOR_ELT:
    3167          30 :     return lowerINSERT_VECTOR_ELT(Op, DAG);
    3168         436 :   case ISD::EXTRACT_VECTOR_ELT:
    3169         436 :     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    3170         255 :   case ISD::FP_ROUND:
    3171         255 :     return lowerFP_ROUND(Op, DAG);
    3172             : 
    3173          36 :   case ISD::TRAP:
    3174             :   case ISD::DEBUGTRAP:
    3175          36 :     return lowerTRAP(Op, DAG);
    3176             :   }
    3177             :   return SDValue();
    3178             : }
    3179             : 
    3180         167 : void SITargetLowering::ReplaceNodeResults(SDNode *N,
    3181             :                                           SmallVectorImpl<SDValue> &Results,
    3182             :                                           SelectionDAG &DAG) const {
    3183         334 :   switch (N->getOpcode()) {
    3184          62 :   case ISD::INSERT_VECTOR_ELT: {
    3185          62 :     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
    3186          10 :       Results.push_back(Res);
    3187          62 :     return;
    3188             :   }
    3189           0 :   case ISD::EXTRACT_VECTOR_ELT: {
    3190           0 :     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
    3191           0 :       Results.push_back(Res);
    3192           0 :     return;
    3193             :   }
    3194          54 :   case ISD::INTRINSIC_WO_CHAIN: {
    3195         216 :     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    3196          54 :     if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
    3197         108 :       SDValue Src0 = N->getOperand(1);
    3198         108 :       SDValue Src1 = N->getOperand(2);
    3199         108 :       SDLoc SL(N);
    3200             :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
    3201         108 :                                 Src0, Src1);
    3202         162 :       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
    3203             :       return;
    3204             :     }
    3205             :     break;
    3206             :   }
    3207          40 :   case ISD::SELECT: {
    3208          80 :     SDLoc SL(N);
    3209          80 :     EVT VT = N->getValueType(0);
    3210          40 :     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    3211          80 :     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    3212          80 :     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
    3213             : 
    3214          40 :     EVT SelectVT = NewVT;
    3215          40 :     if (NewVT.bitsLT(MVT::i32)) {
    3216           4 :       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
    3217           4 :       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
    3218           2 :       SelectVT = MVT::i32;
    3219             :     }
    3220             : 
    3221             :     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
    3222          80 :                                     N->getOperand(0), LHS, RHS);
    3223             : 
    3224          40 :     if (NewVT != SelectVT)
    3225           2 :       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    3226          80 :     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    3227             :     return;
    3228             :   }
    3229             :   default:
    3230             :     break;
    3231             :   }
    3232             : }
    3233             : 
    3234             : /// \brief Helper function for LowerBRCOND
    3235             : static SDNode *findUser(SDValue Value, unsigned Opcode) {
    3236             : 
    3237         645 :   SDNode *Parent = Value.getNode();
    3238             :   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
    3239        1384 :        I != E; ++I) {
    3240             : 
    3241        2762 :     if (I.getUse().get() != Value)
    3242             :       continue;
    3243             : 
    3244         642 :     if (I->getOpcode() == Opcode)
    3245             :       return *I;
    3246             :   }
    3247             :   return nullptr;
    3248             : }
    3249             : 
    3250        1500 : unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    3251        1500 :   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    3252        1656 :     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
    3253             :     case Intrinsic::amdgcn_if:
    3254             :       return AMDGPUISD::IF;
    3255          45 :     case Intrinsic::amdgcn_else:
    3256          45 :       return AMDGPUISD::ELSE;
    3257          44 :     case Intrinsic::amdgcn_loop:
    3258          44 :       return AMDGPUISD::LOOP;
    3259           0 :     case Intrinsic::amdgcn_end_cf:
    3260           0 :       llvm_unreachable("should not occur");
    3261           2 :     default:
    3262           2 :       return 0;
    3263             :     }
    3264             :   }
    3265             : 
    3266             :   // break, if_break, else_break are all only used as inputs to loop, not
    3267             :   // directly as branch conditions.
    3268             :   return 0;
    3269             : }
    3270             : 
    3271           4 : void SITargetLowering::createDebuggerPrologueStackObjects(
    3272             :     MachineFunction &MF) const {
    3273             :   // Create stack objects that are used for emitting debugger prologue.
    3274             :   //
    3275             :   // Debugger prologue writes work group IDs and work item IDs to scratch memory
    3276             :   //   at a fixed location in the following format:
    3277             :   //   offset 0:  work group ID x
    3278             :   //   offset 4:  work group ID y
    3279             :   //   offset 8:  work group ID z
    3280             :   //   offset 16: work item ID x
    3281             :   //   offset 20: work item ID y
    3282             :   //   offset 24: work item ID z
    3283           4 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3284           4 :   int ObjectIdx = 0;
    3285             : 
    3286             :   // For each dimension:
    3287          16 :   for (unsigned i = 0; i < 3; ++i) {
    3288             :     // Create fixed stack object for work group ID.
    3289          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
    3290          12 :     Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
    3291             :     // Create fixed stack object for work item ID.
    3292          12 :     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
    3293          12 :     Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
    3294             :   }
    3295           4 : }
    3296             : 
    3297        1049 : bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
    3298        2098 :   const Triple &TT = getTargetMachine().getTargetTriple();
    3299        2177 :   return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
    3300        1128 :          AMDGPU::shouldEmitConstantsToTextSection(TT);
    3301             : }
    3302             : 
    3303         542 : bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
    3304        1576 :   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    3305        1088 :               GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
    3306         707 :          !shouldEmitFixup(GV) &&
    3307         603 :          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
    3308             : }
    3309             : 
    3310         463 : bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
    3311         463 :   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
    3312             : }
    3313             : 
    3314             : /// This transforms the control flow intrinsics to get the branch destination as
    3315             : /// the last parameter, and also switches the branch target with BR if the need arises.
    3316        1500 : SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
    3317             :                                       SelectionDAG &DAG) const {
    3318        3000 :   SDLoc DL(BRCOND);
    3319             : 
    3320        3000 :   SDNode *Intr = BRCOND.getOperand(1).getNode();
    3321        3000 :   SDValue Target = BRCOND.getOperand(2);
    3322        1500 :   SDNode *BR = nullptr;
    3323        1500 :   SDNode *SetCC = nullptr;
    3324             : 
    3325        1500 :   if (Intr->getOpcode() == ISD::SETCC) {
    3326             :     // As long as we negate the condition everything is fine
    3327        1223 :     SetCC = Intr;
    3328        2446 :     Intr = SetCC->getOperand(0).getNode();
    3329             : 
    3330             :   } else {
    3331             :     // Get the target from BR if we don't negate the condition
    3332         554 :     BR = findUser(BRCOND, ISD::BR);
    3333         554 :     Target = BR->getOperand(1);
    3334             :   }
    3335             : 
    3336             :   // FIXME: This changes the types of the intrinsics instead of introducing new
    3337             :   // nodes with the correct types.
    3338             :   // e.g. llvm.amdgcn.loop
    3339             : 
    3340             :   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
    3341             :   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
    3342             : 
    3343        1500 :   unsigned CFNode = isCFIntrinsic(Intr);
    3344        1500 :   if (CFNode == 0) {
    3345             :     // This is a uniform branch so we don't need to legalize.
    3346        1088 :     return BRCOND;
    3347             :   }
    3348             : 
    3349        1236 :   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
    3350         824 :                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
    3351             : 
    3352             :   assert(!SetCC ||
    3353             :         (SetCC->getConstantOperandVal(1) == 1 &&
    3354             :          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
    3355             :                                                              ISD::SETNE));
    3356             : 
    3357             :   // operands of the new intrinsic call
    3358         412 :   SmallVector<SDValue, 4> Ops;
    3359         412 :   if (HaveChain)
    3360         824 :     Ops.push_back(BRCOND.getOperand(0));
    3361             : 
    3362         824 :   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
    3363         412 :   Ops.push_back(Target);
    3364             : 
    3365        1236 :   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
    3366             : 
    3367             :   // build the new intrinsic call
    3368         412 :   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
    3369             : 
    3370         412 :   if (!HaveChain) {
    3371             :     SDValue Ops[] =  {
    3372             :       SDValue(Result, 0),
    3373           0 :       BRCOND.getOperand(0)
    3374           0 :     };
    3375             : 
    3376           0 :     Result = DAG.getMergeValues(Ops, DL).getNode();
    3377             :   }
    3378             : 
    3379         412 :   if (BR) {
    3380             :     // Give the branch instruction our target
    3381             :     SDValue Ops[] = {
    3382         162 :       BR->getOperand(0),
    3383         162 :       BRCOND.getOperand(2)
    3384         162 :     };
    3385         162 :     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    3386          81 :     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    3387          81 :     BR = NewBR.getNode();
    3388             :   }
    3389             : 
    3390        1236 :   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
    3391             : 
    3392             :   // Copy the intrinsic results to registers
    3393        1192 :   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    3394         736 :     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    3395         368 :     if (!CopyToReg)
    3396           3 :       continue;
    3397             : 
    3398         365 :     Chain = DAG.getCopyToReg(
    3399             :       Chain, DL,
    3400         730 :       CopyToReg->getOperand(1),
    3401             :       SDValue(Result, i - 1),
    3402        1095 :       SDValue());
    3403             : 
    3404        1095 :     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
    3405             :   }
    3406             : 
    3407             :   // Remove the old intrinsic from the chain
    3408         824 :   DAG.ReplaceAllUsesOfValueWith(
    3409         412 :     SDValue(Intr, Intr->getNumValues() - 1),
    3410         824 :     Intr->getOperand(0));
    3411             : 
    3412         412 :   return Chain;
    3413             : }
    3414             : 
    3415        2426 : SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
    3416             :                                             SDValue Op,
    3417             :                                             const SDLoc &DL,
    3418             :                                             EVT VT) const {
    3419        7278 :   return Op.getValueType().bitsLE(VT) ?
    3420        4840 :       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
    3421        7278 :       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
    3422             : }
    3423             : 
    3424         255 : SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
    3425             :   assert(Op.getValueType() == MVT::f16 &&
    3426             :          "Do not know how to custom lower FP_ROUND for non-f16 type");
    3427             : 
    3428         510 :   SDValue Src = Op.getOperand(0);
    3429         510 :   EVT SrcVT = Src.getValueType();
    3430         255 :   if (SrcVT != MVT::f64)
    3431         245 :     return Op;
    3432             : 
    3433          10 :   SDLoc DL(Op);
    3434             : 
    3435          20 :   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
    3436          20 :   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
    3437          20 :   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    3438             : }
    3439             : 
    3440          36 : SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
    3441          72 :   SDLoc SL(Op);
    3442          36 :   MachineFunction &MF = DAG.getMachineFunction();
    3443          72 :   SDValue Chain = Op.getOperand(0);
    3444             : 
    3445          72 :   unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
    3446          36 :     SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
    3447             : 
    3448          68 :   if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
    3449          16 :       Subtarget->isTrapHandlerEnabled()) {
    3450           8 :     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3451           8 :     unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    3452             :     assert(UserSGPR != AMDGPU::NoRegister);
    3453             : 
    3454             :     SDValue QueuePtr = CreateLiveInRegister(
    3455          16 :       DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    3456             : 
    3457           8 :     SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
    3458             : 
    3459             :     SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
    3460           8 :                                      QueuePtr, SDValue());
    3461             : 
    3462             :     SDValue Ops[] = {
    3463             :       ToReg,
    3464           8 :       DAG.getTargetConstant(TrapID, SL, MVT::i16),
    3465             :       SGPR01,
    3466             :       ToReg.getValue(1)
    3467          24 :     };
    3468             : 
    3469          24 :     return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
    3470             :   }
    3471             : 
    3472          28 :   switch (TrapID) {
    3473          21 :   case SISubtarget::TrapIDLLVMTrap:
    3474          42 :     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
    3475           7 :   case SISubtarget::TrapIDLLVMDebugTrap: {
    3476           7 :     DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
    3477             :                                      "debugtrap handler not supported",
    3478             :                                      Op.getDebugLoc(),
    3479          28 :                                      DS_Warning);
    3480           7 :     LLVMContext &Ctx = MF.getFunction()->getContext();
    3481           7 :     Ctx.diagnose(NoTrap);
    3482           7 :     return Chain;
    3483             :   }
    3484           0 :   default:
    3485           0 :     llvm_unreachable("unsupported trap handler type!");
    3486             :   }
    3487             : 
    3488             :   return Chain;
    3489             : }
    3490             : 
    3491          32 : SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
    3492             :                                              SelectionDAG &DAG) const {
    3493             :   // FIXME: Use inline constants (src_{shared, private}_base) instead.
    3494          32 :   if (Subtarget->hasApertureRegs()) {
    3495          12 :     unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
    3496             :         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
    3497             :         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    3498          12 :     unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
    3499             :         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
    3500             :         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    3501          12 :     unsigned Encoding =
    3502             :         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
    3503          12 :         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
    3504             :         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
    3505             : 
    3506          24 :     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
    3507             :     SDValue ApertureReg = SDValue(
    3508          24 :         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
    3509          24 :     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
    3510          24 :     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
    3511             :   }
    3512             : 
    3513          20 :   MachineFunction &MF = DAG.getMachineFunction();
    3514          20 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    3515          20 :   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
    3516             :   assert(UserSGPR != AMDGPU::NoRegister);
    3517             : 
    3518             :   SDValue QueuePtr = CreateLiveInRegister(
    3519          40 :     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
    3520             : 
    3521             :   // Offset into amd_queue_t for group_segment_aperture_base_hi /
    3522             :   // private_segment_aperture_base_hi.
    3523          20 :   uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
    3524             : 
    3525             :   SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
    3526          60 :                             DAG.getConstant(StructOffset, DL, MVT::i64));
    3527             : 
    3528             :   // TODO: Use custom target PseudoSourceValue.
    3529             :   // TODO: We should use the value from the IR intrinsic call, but it might not
    3530             :   // be available, and it is not clear how we would get it.
    3531          20 :   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
    3532          20 :                                               AMDGPUASI.CONSTANT_ADDRESS));
    3533             : 
    3534          40 :   MachinePointerInfo PtrInfo(V, StructOffset);
    3535             :   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
    3536          40 :                      MinAlign(64, StructOffset),
    3537          20 :                      MachineMemOperand::MODereferenceable |
    3538          80 :                          MachineMemOperand::MOInvariant);
    3539             : }
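A small sketch of the S_GETREG_B32 operand packing used above; the shift amounts
below are assumptions for illustration (the real values are the Hwreg::*_SHIFT_
constants), and the point is simply that the hardware-register id, the starting bit
offset, and the field width minus one all share a single simm16 immediate:

  #include <cstdint>

  // Pack {hwreg id, starting bit offset, width - 1} into the simm16 operand,
  // as the Encoding expression above does with its named constants.
  uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned WidthM1) {
    return static_cast<uint16_t>((Id << 0) | (Offset << 6) | (WidthM1 << 11));
  }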
    3540             : 
    3541          45 : SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
    3542             :                                              SelectionDAG &DAG) const {
    3543          90 :   SDLoc SL(Op);
    3544          45 :   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
    3545             : 
    3546          90 :   SDValue Src = ASC->getOperand(0);
    3547          45 :   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
    3548             : 
    3549             :   const AMDGPUTargetMachine &TM =
    3550          45 :     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
    3551             : 
    3552             :   // flat -> local/private
    3553          45 :   if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    3554          12 :     unsigned DestAS = ASC->getDestAddressSpace();
    3555             : 
    3556          17 :     if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
    3557           5 :         DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
    3558          24 :       unsigned NullVal = TM.getNullPointerValue(DestAS);
    3559          12 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    3560          12 :       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
    3561          24 :       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
    3562             : 
    3563             :       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
    3564          12 :                          NonNull, Ptr, SegmentNullPtr);
    3565             :     }
    3566             :   }
    3567             : 
    3568             :   // local/private -> flat
    3569          33 :   if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
    3570          32 :     unsigned SrcAS = ASC->getSrcAddressSpace();
    3571             : 
    3572          54 :     if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
    3573          22 :         SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
    3574          64 :       unsigned NullVal = TM.getNullPointerValue(SrcAS);
    3575          32 :       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
    3576             : 
    3577             :       SDValue NonNull
    3578          32 :         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
    3579             : 
    3580          32 :       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
    3581             :       SDValue CvtPtr
    3582          64 :         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
    3583             : 
    3584             :       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
    3585          32 :                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
    3586          96 :                          FlatNullPtr);
    3587             :     }
    3588             :   }
    3589             : 
    3590             :   // global <-> flat are no-ops and never emitted.
    3591             : 
    3592           1 :   const MachineFunction &MF = DAG.getMachineFunction();
    3593             :   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
    3594           2 :     *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
    3595           1 :   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
    3596             : 
    3597           2 :   return DAG.getUNDEF(ASC->getValueType(0));
    3598             : }
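A rough scalar model of the segment-to-flat path above (a sketch only; the segment
null value and the 32-bit aperture come from the TargetMachine and the aperture
registers or queue pointer, as in the code): a non-null segment pointer gets the
aperture as its high 32 bits, while the segment null pointer maps to the flat null
pointer.

  #include <cstdint>

  uint64_t segmentToFlat(uint32_t SegPtr, uint32_t SegmentNull, uint32_t Aperture) {
    if (SegPtr == SegmentNull)
      return 0;                                    // flat null pointer
    return (uint64_t(Aperture) << 32) | SegPtr;    // {aperture, segment offset}
  }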
    3599             : 
    3600          92 : SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    3601             :                                                  SelectionDAG &DAG) const {
    3602         184 :   SDValue Idx = Op.getOperand(2);
    3603          14 :   if (isa<ConstantSDNode>(Idx))
    3604          78 :     return SDValue();
    3605             : 
    3606             :   // Avoid stack access for dynamic indexing.
    3607          14 :   SDLoc SL(Op);
    3608          28 :   SDValue Vec = Op.getOperand(0);
    3609          42 :   SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
    3610             : 
    3611             :   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
    3612          28 :   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
    3613             : 
    3614             :   // Convert vector index to bit-index.
    3615             :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
    3616          42 :                                   DAG.getConstant(16, SL, MVT::i32));
    3617             : 
    3618          28 :   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    3619             : 
    3620             :   SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
    3621          14 :                             DAG.getConstant(0xffff, SL, MVT::i32),
    3622          42 :                             ScaledIdx);
    3623             : 
    3624          28 :   SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
    3625             :   SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
    3626          42 :                             DAG.getNOT(SL, BFM, MVT::i32), BCVec);
    3627             : 
    3628          28 :   SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
    3629          28 :   return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
    3630             : }
    3631             : 
    3632         436 : SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
    3633             :                                                   SelectionDAG &DAG) const {
    3634         872 :   SDLoc SL(Op);
    3635             : 
    3636         872 :   EVT ResultVT = Op.getValueType();
    3637         872 :   SDValue Vec = Op.getOperand(0);
    3638         872 :   SDValue Idx = Op.getOperand(1);
    3639             : 
    3640         436 :   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    3641             : 
    3642             :   // Make sure we do any optimizations that will make it easier to fold
    3643             :   // source modifiers before obscuring it with bit operations.
    3644             : 
    3645             :   // XXX - Why doesn't this get called when vector_shuffle is expanded?
    3646         436 :   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    3647           7 :     return Combined;
    3648             : 
    3649         416 :   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
    3650         832 :     SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    3651             : 
    3652         416 :     if (CIdx->getZExtValue() == 1) {
    3653         180 :       Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
    3654         720 :                            DAG.getConstant(16, SL, MVT::i32));
    3655             :     } else {
    3656             :       assert(CIdx->getZExtValue() == 0);
    3657             :     }
    3658             : 
    3659         416 :     if (ResultVT.bitsLT(MVT::i32))
    3660         690 :       Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
    3661         416 :     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    3662             :   }
    3663             : 
    3664          13 :   SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
    3665             : 
    3666             :   // Convert vector index to bit-index.
    3667          26 :   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
    3668             : 
    3669          26 :   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    3670          26 :   SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
    3671             : 
    3672          13 :   SDValue Result = Elt;
    3673          13 :   if (ResultVT.bitsLT(MVT::i32))
    3674          14 :     Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
    3675             : 
    3676          13 :   return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
    3677             : }
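A scalar illustration of the dynamic-index path above (sketch only): a <2 x i16>
vector occupies a single 32-bit register, so extracting element Idx is a shift by
Idx * 16 followed by a truncation to 16 bits.

  #include <cstdint>

  uint16_t extractElt_v2i16(uint32_t Vec, unsigned Idx) {
    unsigned ScaledIdx = Idx * 16;                   // vector index -> bit index
    return static_cast<uint16_t>(Vec >> ScaledIdx);  // SRL + TRUNCATE
  }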
    3678             : 
    3679             : bool
    3680        1524 : SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    3681             :   // We can fold offsets for anything that doesn't require a GOT relocation.
    3682        3020 :   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
    3683        1603 :               GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
    3684        1603 :          !shouldEmitGOTReloc(GA->getGlobal());
    3685             : }
    3686             : 
    3687             : static SDValue
    3688         482 : buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
    3689             :                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
    3690             :                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
    3691             :   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
    3692             :   // lowered to the following code sequence:
    3693             :   //
    3694             :   // For constant address space:
    3695             :   //   s_getpc_b64 s[0:1]
    3696             :   //   s_add_u32 s0, s0, $symbol
    3697             :   //   s_addc_u32 s1, s1, 0
    3698             :   //
    3699             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    3700             :   //   a fixup or relocation is emitted to replace $symbol with a literal
    3701             :   //   constant, which is a pc-relative offset from the encoding of the $symbol
    3702             :   //   operand to the global variable.
    3703             :   //
    3704             :   // For global address space:
    3705             :   //   s_getpc_b64 s[0:1]
    3706             :   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
    3707             :   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
    3708             :   //
    3709             :   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
    3710             :   //   fixups or relocations are emitted to replace $symbol@*@lo and
    3711             :   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
    3712             :   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
    3713             :   //   operand to the global variable.
    3714             :   //
    3715             :   // What we want here is an offset from the value returned by s_getpc
    3716             :   // (which is the address of the s_add_u32 instruction) to the global
    3717             :   // variable, but since the encoding of $symbol starts 4 bytes after the start
    3718             :   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
    3719             :   // small. This requires us to add 4 to the global variable offset in order to
    3720             :   // compute the correct address.
    3721         482 :   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    3722        1928 :                                              GAFlags);
    3723             :   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
    3724             :                                              GAFlags == SIInstrInfo::MO_NONE ?
    3725        1446 :                                              GAFlags : GAFlags + 1);
    3726         482 :   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
    3727             : }
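The "+ 4" bias above can be checked with a little address arithmetic (a sketch under
the assumptions stated in the comment: s_getpc_b64 yields the address of the
s_add_u32, and the $symbol literal encodes the pc-relative distance from its own
encoding, which starts 4 bytes into that instruction, to the global variable):

  #include <cstdint>

  // FixupValue = GlobalAddr - (SAddU32Addr + 4); biasing the operand by +4 makes
  // the s_getpc_b64 result plus the literal land exactly on the global variable.
  uint64_t pcAddRelResult(uint64_t SAddU32Addr, uint64_t FixupValue) {
    return SAddU32Addr + (FixupValue + 4);
  }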
    3728             : 
    3729         762 : SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    3730             :                                              SDValue Op,
    3731             :                                              SelectionDAG &DAG) const {
    3732         762 :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
    3733         762 :   const GlobalValue *GV = GSD->getGlobal();
    3734             : 
    3735        1502 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
    3736        1480 :       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
    3737             :       // FIXME: It isn't correct to rely on the type of the pointer. This should
    3738             :       // be removed when address space 0 is 64-bit.
    3739        1436 :       !GV->getType()->getElementType()->isFunctionTy())
    3740         280 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
    3741             : 
    3742         482 :   SDLoc DL(GSD);
    3743         964 :   EVT PtrVT = Op.getValueType();
    3744             : 
    3745         482 :   if (shouldEmitFixup(GV))
    3746          19 :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
    3747         463 :   else if (shouldEmitPCReloc(GV))
    3748             :     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
    3749         442 :                                    SIInstrInfo::MO_REL32);
    3750             : 
    3751             :   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
    3752          21 :                                             SIInstrInfo::MO_GOTPCREL32);
    3753             : 
    3754          21 :   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
    3755          21 :   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    3756          42 :   const DataLayout &DataLayout = DAG.getDataLayout();
    3757          21 :   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
    3758             :   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
    3759          42 :   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    3760             : 
    3761             :   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
    3762          21 :                      MachineMemOperand::MODereferenceable |
    3763          63 :                          MachineMemOperand::MOInvariant);
    3764             : }
    3765             : 
    3766        7543 : SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
    3767             :                                    const SDLoc &DL, SDValue V) const {
    3768             :   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
    3769             :   // the destination register.
    3770             :   //
    3771             :   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
    3772             :   // so we will end up with redundant moves to m0.
    3773             :   //
    3774             :   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
    3775             : 
    3776             :   // A Null SDValue creates a glue result.
    3777        7543 :   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
    3778       15086 :                                   V, Chain);
    3779        7543 :   return SDValue(M0, 0);
    3780             : }
    3781             : 
    3782          85 : SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
    3783             :                                                  SDValue Op,
    3784             :                                                  MVT VT,
    3785             :                                                  unsigned Offset) const {
    3786         170 :   SDLoc SL(Op);
    3787             :   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
    3788         340 :                                            DAG.getEntryNode(), Offset, false);
    3789             :   // The local size values will have the hi 16-bits as zero.
    3790             :   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
    3791         340 :                      DAG.getValueType(VT));
    3792             : }
    3793             : 
    3794           2 : static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    3795             :                                         EVT VT) {
    3796           2 :   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
    3797             :                                       "non-hsa intrinsic with hsa target",
    3798           6 :                                       DL.getDebugLoc());
    3799           2 :   DAG.getContext()->diagnose(BadIntrin);
    3800           4 :   return DAG.getUNDEF(VT);
    3801             : }
    3802             : 
    3803           5 : static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
    3804             :                                          EVT VT) {
    3805           5 :   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
    3806             :                                       "intrinsic not supported on subtarget",
    3807          15 :                                       DL.getDebugLoc());
    3808           5 :   DAG.getContext()->diagnose(BadIntrin);
    3809          10 :   return DAG.getUNDEF(VT);
    3810             : }
    3811             : 
    3812        5602 : SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    3813             :                                                   SelectionDAG &DAG) const {
    3814        5602 :   MachineFunction &MF = DAG.getMachineFunction();
    3815        5602 :   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
    3816             : 
    3817       11204 :   EVT VT = Op.getValueType();
    3818       11204 :   SDLoc DL(Op);
    3819       22408 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    3820             : 
    3821             :   // TODO: Should this propagate fast-math-flags?
    3822             : 
    3823        5602 :   switch (IntrinsicID) {
    3824           4 :   case Intrinsic::amdgcn_implicit_buffer_ptr: {
    3825           4 :     if (getSubtarget()->isAmdCodeObjectV2(MF))
    3826           2 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3827             :     return getPreloadedValue(DAG, *MFI, VT,
    3828           2 :                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
    3829             :   }
    3830          38 :   case Intrinsic::amdgcn_dispatch_ptr:
    3831             :   case Intrinsic::amdgcn_queue_ptr: {
    3832          38 :     if (!Subtarget->isAmdCodeObjectV2(MF)) {
    3833             :       DiagnosticInfoUnsupported BadIntrin(
    3834           2 :           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
    3835           6 :           DL.getDebugLoc());
    3836           2 :       DAG.getContext()->diagnose(BadIntrin);
    3837           2 :       return DAG.getUNDEF(VT);
    3838             :     }
    3839             : 
    3840          36 :     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
    3841             :       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
    3842          36 :     return getPreloadedValue(DAG, *MFI, VT, RegID);
    3843             :   }
    3844          25 :   case Intrinsic::amdgcn_implicitarg_ptr: {
    3845          25 :     if (MFI->isEntryFunction())
    3846          19 :       return getImplicitArgPtr(DAG, DL);
    3847             :     return getPreloadedValue(DAG, *MFI, VT,
    3848           6 :                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    3849             :   }
    3850          27 :   case Intrinsic::amdgcn_kernarg_segment_ptr: {
    3851             :     return getPreloadedValue(DAG, *MFI, VT,
    3852          27 :                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    3853             :   }
    3854           9 :   case Intrinsic::amdgcn_dispatch_id: {
    3855           9 :     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
    3856             :   }
    3857          20 :   case Intrinsic::amdgcn_rcp:
    3858          40 :     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
    3859          31 :   case Intrinsic::amdgcn_rsq:
    3860          62 :     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    3861           5 :   case Intrinsic::amdgcn_rsq_legacy:
    3862           5 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    3863           1 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    3864             : 
    3865           8 :     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    3866          11 :   case Intrinsic::amdgcn_rcp_legacy:
    3867          11 :     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    3868           4 :       return emitRemovedIntrinsicError(DAG, DL, VT);
    3869          14 :     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
    3870           6 :   case Intrinsic::amdgcn_rsq_clamp: {
    3871           6 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    3872           6 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    3873             : 
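                     :     // On VOLCANIC_ISLANDS and newer, the clamp is emulated below by limiting
                     :     // rsq(x) to the largest finite value of the type, roughly
                     :     // fmaxnum(fminnum(rsq(x), +MAX), -MAX).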
    3874           3 :     Type *Type = VT.getTypeForEVT(*DAG.getContext());
    3875           6 :     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    3876           9 :     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
    3877             : 
    3878           6 :     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    3879             :     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
    3880           3 :                               DAG.getConstantFP(Max, DL, VT));
    3881             :     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
    3882           3 :                        DAG.getConstantFP(Min, DL, VT));
    3883             :   }
    3884           2 :   case Intrinsic::r600_read_ngroups_x:
    3885           4 :     if (Subtarget->isAmdHsaOS())
    3886           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3887             : 
    3888             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3889           2 :                                     SI::KernelInputOffsets::NGROUPS_X, false);
    3890           2 :   case Intrinsic::r600_read_ngroups_y:
    3891           4 :     if (Subtarget->isAmdHsaOS())
    3892           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3893             : 
    3894             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3895           2 :                                     SI::KernelInputOffsets::NGROUPS_Y, false);
    3896           2 :   case Intrinsic::r600_read_ngroups_z:
    3897           4 :     if (Subtarget->isAmdHsaOS())
    3898           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3899             : 
    3900             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3901           2 :                                     SI::KernelInputOffsets::NGROUPS_Z, false);
    3902           2 :   case Intrinsic::r600_read_global_size_x:
    3903           4 :     if (Subtarget->isAmdHsaOS())
    3904           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3905             : 
    3906             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3907           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
    3908           2 :   case Intrinsic::r600_read_global_size_y:
    3909           4 :     if (Subtarget->isAmdHsaOS())
    3910           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3911             : 
    3912             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3913           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
    3914           2 :   case Intrinsic::r600_read_global_size_z:
    3915           4 :     if (Subtarget->isAmdHsaOS())
    3916           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3917             : 
    3918             :     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
    3919           2 :                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
    3920          13 :   case Intrinsic::r600_read_local_size_x:
    3921          26 :     if (Subtarget->isAmdHsaOS())
    3922           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3923             : 
    3924             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    3925          13 :                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
    3926          36 :   case Intrinsic::r600_read_local_size_y:
    3927          72 :     if (Subtarget->isAmdHsaOS())
    3928           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3929             : 
    3930             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    3931          36 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
    3932          36 :   case Intrinsic::r600_read_local_size_z:
    3933          72 :     if (Subtarget->isAmdHsaOS())
    3934           0 :       return emitNonHSAIntrinsicError(DAG, DL, VT);
    3935             : 
    3936             :     return lowerImplicitZextParam(DAG, Op, MVT::i16,
    3937          36 :                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
    3938          41 :   case Intrinsic::amdgcn_workgroup_id_x:
    3939             :   case Intrinsic::r600_read_tgid_x:
    3940             :     return getPreloadedValue(DAG, *MFI, VT,
    3941          41 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    3942          24 :   case Intrinsic::amdgcn_workgroup_id_y:
    3943             :   case Intrinsic::r600_read_tgid_y:
    3944             :     return getPreloadedValue(DAG, *MFI, VT,
    3945          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    3946          24 :   case Intrinsic::amdgcn_workgroup_id_z:
    3947             :   case Intrinsic::r600_read_tgid_z:
    3948             :     return getPreloadedValue(DAG, *MFI, VT,
    3949          24 :                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    3950        2686 :   case Intrinsic::amdgcn_workitem_id_x:
    3951             :   case Intrinsic::r600_read_tidig_x:
    3952             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    3953        8058 :                           SDLoc(DAG.getEntryNode()),
    3954        5372 :                           MFI->getArgInfo().WorkItemIDX);
    3956          94 :   case Intrinsic::amdgcn_workitem_id_y:
    3957             :   case Intrinsic::r600_read_tidig_y:
    3958             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    3959         282 :                           SDLoc(DAG.getEntryNode()),
    3960         188 :                           MFI->getArgInfo().WorkItemIDY);
    3961          57 :   case Intrinsic::amdgcn_workitem_id_z:
    3962             :   case Intrinsic::r600_read_tidig_z:
    3963             :     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
    3964         171 :                           SDLoc(DAG.getEntryNode()),
    3965         114 :                           MFI->getArgInfo().WorkItemIDZ);
    3966         489 :   case AMDGPUIntrinsic::SI_load_const: {
    3967             :     SDValue Ops[] = {
    3968         978 :       Op.getOperand(1),
    3969         978 :       Op.getOperand(2)
    3970         978 :     };
    3971             : 
    3972        2445 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    3973             :         MachinePointerInfo(),
    3974         978 :         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    3975             :             MachineMemOperand::MOInvariant,
    3976         978 :         VT.getStoreSize(), 4);
    3977             :     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
    3978         978 :                                    Op->getVTList(), Ops, VT, MMO);
    3979             :   }
    3980           4 :   case Intrinsic::amdgcn_fdiv_fast:
    3981           4 :     return lowerFDIV_FAST(Op, DAG);
    3982          75 :   case Intrinsic::amdgcn_interp_mov: {
    3983         225 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    3984         150 :     SDValue Glue = M0.getValue(1);
    3985         150 :     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
    3986         300 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    3987             :   }
    3988         204 :   case Intrinsic::amdgcn_interp_p1: {
    3989         612 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    3990         408 :     SDValue Glue = M0.getValue(1);
    3991         408 :     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
    3992         816 :                        Op.getOperand(2), Op.getOperand(3), Glue);
    3993             :   }
    3994         188 :   case Intrinsic::amdgcn_interp_p2: {
    3995         564 :     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    3996         376 :     SDValue Glue = SDValue(M0.getNode(), 1);
    3997         376 :     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
    3998         752 :                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
    3999         188 :                        Glue);
    4000             :   }
    4001           5 :   case Intrinsic::amdgcn_sin:
    4002          10 :     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
    4003             : 
    4004           3 :   case Intrinsic::amdgcn_cos:
    4005           6 :     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
    4006             : 
    4007           3 :   case Intrinsic::amdgcn_log_clamp: {
    4008           3 :     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    4009           2 :       return SDValue();
    4010             : 
    4011             :     DiagnosticInfoUnsupported BadIntrin(
    4012           1 :       *MF.getFunction(), "intrinsic not supported on subtarget",
    4013           2 :       DL.getDebugLoc());
    4014           1 :     DAG.getContext()->diagnose(BadIntrin);
    4015           1 :     return DAG.getUNDEF(VT);
    4016             :   }
    4017           9 :   case Intrinsic::amdgcn_ldexp:
    4018             :     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
    4019          27 :                        Op.getOperand(1), Op.getOperand(2));
    4020             : 
    4021           7 :   case Intrinsic::amdgcn_fract:
    4022          14 :     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    4023             : 
    4024          55 :   case Intrinsic::amdgcn_class:
    4025             :     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
    4026         165 :                        Op.getOperand(1), Op.getOperand(2));
    4027          10 :   case Intrinsic::amdgcn_div_fmas:
    4028             :     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
    4029          40 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
    4030          30 :                        Op.getOperand(4));
    4031             : 
    4032          13 :   case Intrinsic::amdgcn_div_fixup:
    4033             :     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
    4034          52 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4035             : 
    4036           4 :   case Intrinsic::amdgcn_trig_preop:
    4037             :     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
    4038          12 :                        Op.getOperand(1), Op.getOperand(2));
    4039          27 :   case Intrinsic::amdgcn_div_scale: {
    4040             :     // The third parameter is required to be a constant.
    4041          78 :     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4042             :     if (!Param)
    4043           9 :       return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
    4044             : 
    4045             :     // Translate to the operands expected by the machine instruction. The first
    4046             :     // operand must repeat either the numerator or the denominator.
    4047          48 :     SDValue Numerator = Op.getOperand(1);
    4048          48 :     SDValue Denominator = Op.getOperand(2);
    4049             : 
    4050             :     // Note this order is opposite of the machine instruction's operations,
    4051             :     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    4052             :     // intrinsic has the numerator as the first operand to match a normal
    4053             :     // division operation.
    4054             : 
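                     :     // For example (illustrative): if the constant selects the numerator, the
                     :     // node built below is (DIV_SCALE num, den, num); otherwise it is
                     :     // (DIV_SCALE den, den, num).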
    4055          24 :     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
    4056             : 
    4057             :     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
    4058          48 :                        Denominator, Numerator);
    4059             :   }
    4060          48 :   case Intrinsic::amdgcn_icmp: {
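                     :     // Operand 3 carries an ICmpInst predicate as an integer; for example
                     :     // (illustrative) an ICMP_EQ predicate becomes (AMDGPUISD::SETCC a, b,
                     :     // seteq), while out-of-range predicate values fold to undef below.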
    4061         138 :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4062             :     if (!CD)
    4063           6 :       return DAG.getUNDEF(VT);
    4064             : 
    4065          42 :     int CondCode = CD->getSExtValue();
    4066          42 :     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
    4067             :         CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
    4068           2 :       return DAG.getUNDEF(VT);
    4069             : 
    4070          40 :     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
    4071          40 :     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
    4072          80 :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4073         120 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4074             :   }
    4075          56 :   case Intrinsic::amdgcn_fcmp: {
    4076         166 :     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    4077             :     if (!CD)
    4078           2 :       return DAG.getUNDEF(VT);
    4079             : 
    4080          54 :     int CondCode = CD->getSExtValue();
    4081          54 :     if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
    4082             :         CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
    4083           2 :       return DAG.getUNDEF(VT);
    4084             : 
    4085          52 :     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
    4086          52 :     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
    4087         104 :     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
    4088         156 :                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
    4089             :   }
    4090          69 :   case Intrinsic::amdgcn_fmed3:
    4091             :     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
    4092         276 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4093          31 :   case Intrinsic::amdgcn_fmul_legacy:
    4094             :     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
    4095          93 :                        Op.getOperand(1), Op.getOperand(2));
    4096           4 :   case Intrinsic::amdgcn_sffbh:
    4097           8 :     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
    4098         102 :   case Intrinsic::amdgcn_sbfe:
    4099             :     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
    4100         408 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4101          94 :   case Intrinsic::amdgcn_ubfe:
    4102             :     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
    4103         376 :                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    4104          10 :   case Intrinsic::amdgcn_cvt_pkrtz: {
    4105             :     // FIXME: Stop adding cast if v2f16 legal.
    4106          20 :     EVT VT = Op.getValueType();
    4107             :     SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
    4108          40 :                                Op.getOperand(1), Op.getOperand(2));
    4109          10 :     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
    4110             :   }
    4111           8 :   case Intrinsic::amdgcn_wqm: {
    4112          16 :     SDValue Src = Op.getOperand(1);
    4113          16 :     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
    4114           8 :                    0);
    4115             :   }
    4116          14 :   case Intrinsic::amdgcn_wwm: {
    4117          28 :     SDValue Src = Op.getOperand(1);
    4118          28 :     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
    4119          14 :                    0);
    4120             :   }
    4121         871 :   default:
    4122         871 :     return Op;
    4123             :   }
    4124             : }
    4125             : 
    4126        1239 : SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
    4127             :                                                  SelectionDAG &DAG) const {
    4128        4956 :   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    4129        2478 :   SDLoc DL(Op);
    4130        1239 :   MachineFunction &MF = DAG.getMachineFunction();
    4131             : 
    4132        1239 :   switch (IntrID) {
    4133         150 :   case Intrinsic::amdgcn_atomic_inc:
    4134             :   case Intrinsic::amdgcn_atomic_dec: {
    4135         150 :     MemSDNode *M = cast<MemSDNode>(Op);
    4136         150 :     unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
    4137             :       AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
    4138             :     SDValue Ops[] = {
    4139         300 :       M->getOperand(0), // Chain
    4140         300 :       M->getOperand(2), // Ptr
    4141         300 :       M->getOperand(3)  // Value
    4142         450 :     };
    4143             : 
    4144         300 :     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
    4145         750 :                                    M->getMemoryVT(), M->getMemOperand());
    4146             :   }
    4147         125 :   case Intrinsic::amdgcn_buffer_load:
    4148             :   case Intrinsic::amdgcn_buffer_load_format: {
    4149             :     SDValue Ops[] = {
    4150         250 :       Op.getOperand(0), // Chain
    4151         250 :       Op.getOperand(2), // rsrc
    4152         250 :       Op.getOperand(3), // vindex
    4153         250 :       Op.getOperand(4), // offset
    4154         250 :       Op.getOperand(5), // glc
    4155         250 :       Op.getOperand(6)  // slc
    4156         750 :     };
    4157         125 :     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    4158             : 
    4159         125 :     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
    4160             :         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
    4161         250 :     EVT VT = Op.getValueType();
    4162         125 :     EVT IntVT = VT.changeTypeToInteger();
    4163             : 
    4164         625 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4165         125 :       MachinePointerInfo(MFI->getBufferPSV()),
    4166             :       MachineMemOperand::MOLoad,
    4167         250 :       VT.getStoreSize(), VT.getStoreSize());
    4168             : 
    4169         250 :     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
    4170             :   }
    4171          28 :   case Intrinsic::amdgcn_tbuffer_load: {
    4172             :     SDValue Ops[] = {
    4173          56 :       Op.getOperand(0),  // Chain
    4174          56 :       Op.getOperand(2),  // rsrc
    4175          56 :       Op.getOperand(3),  // vindex
    4176          56 :       Op.getOperand(4),  // voffset
    4177          56 :       Op.getOperand(5),  // soffset
    4178          56 :       Op.getOperand(6),  // offset
    4179          56 :       Op.getOperand(7),  // dfmt
    4180          56 :       Op.getOperand(8),  // nfmt
    4181          56 :       Op.getOperand(9),  // glc
    4182          56 :       Op.getOperand(10)   // slc
    4183         280 :     };
    4184             : 
    4185          84 :     EVT VT = Op.getOperand(2).getValueType();
    4186             : 
    4187         140 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4188             :       MachinePointerInfo(),
    4189             :       MachineMemOperand::MOLoad,
    4190          56 :       VT.getStoreSize(), VT.getStoreSize());
    4191             :     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
    4192          56 :                                    Op->getVTList(), Ops, VT, MMO);
    4193             :   }
    4194             :   // Basic sample.
    4195         528 :   case Intrinsic::amdgcn_image_sample:
    4196             :   case Intrinsic::amdgcn_image_sample_cl:
    4197             :   case Intrinsic::amdgcn_image_sample_d:
    4198             :   case Intrinsic::amdgcn_image_sample_d_cl:
    4199             :   case Intrinsic::amdgcn_image_sample_l:
    4200             :   case Intrinsic::amdgcn_image_sample_b:
    4201             :   case Intrinsic::amdgcn_image_sample_b_cl:
    4202             :   case Intrinsic::amdgcn_image_sample_lz:
    4203             :   case Intrinsic::amdgcn_image_sample_cd:
    4204             :   case Intrinsic::amdgcn_image_sample_cd_cl:
    4205             : 
    4206             :   // Sample with comparison.
    4207             :   case Intrinsic::amdgcn_image_sample_c:
    4208             :   case Intrinsic::amdgcn_image_sample_c_cl:
    4209             :   case Intrinsic::amdgcn_image_sample_c_d:
    4210             :   case Intrinsic::amdgcn_image_sample_c_d_cl:
    4211             :   case Intrinsic::amdgcn_image_sample_c_l:
    4212             :   case Intrinsic::amdgcn_image_sample_c_b:
    4213             :   case Intrinsic::amdgcn_image_sample_c_b_cl:
    4214             :   case Intrinsic::amdgcn_image_sample_c_lz:
    4215             :   case Intrinsic::amdgcn_image_sample_c_cd:
    4216             :   case Intrinsic::amdgcn_image_sample_c_cd_cl:
    4217             : 
    4218             :   // Sample with offsets.
    4219             :   case Intrinsic::amdgcn_image_sample_o:
    4220             :   case Intrinsic::amdgcn_image_sample_cl_o:
    4221             :   case Intrinsic::amdgcn_image_sample_d_o:
    4222             :   case Intrinsic::amdgcn_image_sample_d_cl_o:
    4223             :   case Intrinsic::amdgcn_image_sample_l_o:
    4224             :   case Intrinsic::amdgcn_image_sample_b_o:
    4225             :   case Intrinsic::amdgcn_image_sample_b_cl_o:
    4226             :   case Intrinsic::amdgcn_image_sample_lz_o:
    4227             :   case Intrinsic::amdgcn_image_sample_cd_o:
    4228             :   case Intrinsic::amdgcn_image_sample_cd_cl_o:
    4229             : 
    4230             :   // Sample with comparison and offsets.
    4231             :   case Intrinsic::amdgcn_image_sample_c_o:
    4232             :   case Intrinsic::amdgcn_image_sample_c_cl_o:
    4233             :   case Intrinsic::amdgcn_image_sample_c_d_o:
    4234             :   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
    4235             :   case Intrinsic::amdgcn_image_sample_c_l_o:
    4236             :   case Intrinsic::amdgcn_image_sample_c_b_o:
    4237             :   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
    4238             :   case Intrinsic::amdgcn_image_sample_c_lz_o:
    4239             :   case Intrinsic::amdgcn_image_sample_c_cd_o:
    4240             :   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    4241             : 
    4242             :   case Intrinsic::amdgcn_image_getlod: {
    4243             :     // If the dmask has every channel disabled, replace the result with undef.
    4244        1582 :     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
    4245         526 :     if (!DMask || DMask->isNullValue()) {
    4246         128 :       SDValue Undef = DAG.getUNDEF(Op.getValueType());
    4247         256 :       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
    4248             :     }
    4249             : 
    4250         464 :     return SDValue();
    4251             :   }
    4252         408 :   default:
    4253         408 :     return SDValue();
    4254             :   }
    4255             : }
    4256             : 
    4257        1915 : SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    4258             :                                               SelectionDAG &DAG) const {
    4259        3830 :   SDLoc DL(Op);
    4260        3830 :   SDValue Chain = Op.getOperand(0);
    4261        7660 :   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    4262        1915 :   MachineFunction &MF = DAG.getMachineFunction();
    4263             : 
    4264        1915 :   switch (IntrinsicID) {
    4265         313 :   case Intrinsic::amdgcn_exp: {
    4266         939 :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    4267         939 :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    4268         939 :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
    4269         939 :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
    4270             : 
    4271             :     const SDValue Ops[] = {
    4272             :       Chain,
    4273         626 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    4274         626 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    4275         626 :       Op.getOperand(4), // src0
    4276         626 :       Op.getOperand(5), // src1
    4277         626 :       Op.getOperand(6), // src2
    4278         626 :       Op.getOperand(7), // src3
    4279         313 :       DAG.getTargetConstant(0, DL, MVT::i1), // compr
    4280         626 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    4281        2817 :     };
    4282             : 
    4283         313 :     unsigned Opc = Done->isNullValue() ?
    4284         313 :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    4285         626 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    4286             :   }
    4287          93 :   case Intrinsic::amdgcn_exp_compr: {
    4288         279 :     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
    4289         279 :     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
    4290         186 :     SDValue Src0 = Op.getOperand(4);
    4291         186 :     SDValue Src1 = Op.getOperand(5);
    4292         279 :     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    4293         279 :     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
    4294             : 
    4295          93 :     SDValue Undef = DAG.getUNDEF(MVT::f32);
    4296             :     const SDValue Ops[] = {
    4297             :       Chain,
    4298         186 :       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
    4299         186 :       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
    4300         279 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
    4301         279 :       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    4302             :       Undef, // src2
    4303             :       Undef, // src3
    4304          93 :       DAG.getTargetConstant(1, DL, MVT::i1), // compr
    4305         186 :       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
    4306         837 :     };
    4307             : 
    4308          93 :     unsigned Opc = Done->isNullValue() ?
    4309          93 :       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
    4310         186 :     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
    4311             :   }
    4312          24 :   case Intrinsic::amdgcn_s_sendmsg:
    4313             :   case Intrinsic::amdgcn_s_sendmsghalt: {
    4314          24 :     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
    4315             :       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
    4316          48 :     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    4317          48 :     SDValue Glue = Chain.getValue(1);
    4318             :     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
    4319          72 :                        Op.getOperand(2), Glue);
    4320             :   }
    4321           2 :   case Intrinsic::amdgcn_init_exec: {
    4322             :     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
    4323           6 :                        Op.getOperand(2));
    4324             :   }
    4325           4 :   case Intrinsic::amdgcn_init_exec_from_input: {
    4326             :     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
    4327          16 :                        Op.getOperand(2), Op.getOperand(3));
    4328             :   }
    4329          30 :   case AMDGPUIntrinsic::AMDGPU_kill: {
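                     :     // A kill whose operand is a known non-negative constant can never fire,
                     :     // so it folds to the incoming chain; a known negative constant becomes an
                     :     // unconditional KILL of -1.0.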
    4330          60 :     SDValue Src = Op.getOperand(2);
    4331          11 :     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
    4332          11 :       if (!K->isNegative())
    4333           4 :         return Chain;
    4334             : 
    4335          21 :       SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
    4336          14 :       return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
    4337             :     }
    4338             : 
    4339          38 :     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
    4340          38 :     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
    4341             :   }
    4342         135 :   case Intrinsic::amdgcn_s_barrier: {
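                     :     // When optimizing and the flat workgroup fits in a single wave, the real
                     :     // barrier is dropped and only a WAVE_BARRIER scheduling pseudo is kept.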
    4343         135 :     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
    4344         127 :       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    4345         127 :       unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
    4346         127 :       if (WGSize <= ST.getWavefrontSize())
    4347          10 :         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
    4348          20 :                                           Op.getOperand(0)), 0);
    4349             :     }
    4350         130 :     return SDValue();
    4351             :   }
    4352          14 :   case AMDGPUIntrinsic::SI_tbuffer_store: {
    4353             : 
    4354             :     // Extract vindex and voffset from vaddr as appropriate
    4355          42 :     const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
    4356          42 :     const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
    4357          28 :     SDValue VAddr = Op.getOperand(5);
    4358             : 
    4359          28 :     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
    4360             : 
    4361             :     assert(!(OffEn->isOne() && IdxEn->isOne()) &&
    4362             :            "Legacy intrinsic doesn't support both offset and index - use new version");
    4363             : 
    4364          14 :     SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
    4365          14 :     SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
    4366             : 
    4367             :     // Deal with the vec-3 case
    4368          42 :     const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
    4369          14 :     auto Opcode = NumChannels->getZExtValue() == 3 ?
    4370          14 :       AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
    4371             : 
    4372             :     SDValue Ops[] = {
    4373             :      Chain,
    4374          28 :      Op.getOperand(3),  // vdata
    4375          28 :      Op.getOperand(2),  // rsrc
    4376             :      VIndex,
    4377             :      VOffset,
    4378          28 :      Op.getOperand(6),  // soffset
    4379          28 :      Op.getOperand(7),  // inst_offset
    4380          28 :      Op.getOperand(8),  // dfmt
    4381          28 :      Op.getOperand(9),  // nfmt
    4382          28 :      Op.getOperand(12), // glc
    4383          28 :      Op.getOperand(13), // slc
    4384         126 :     };
    4385             : 
    4386             :     assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
    4387             :            "Value of tfe other than zero is unsupported");
    4388             : 
    4389          42 :     EVT VT = Op.getOperand(3).getValueType();
    4390          56 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4391             :       MachinePointerInfo(),
    4392             :       MachineMemOperand::MOStore,
    4393          28 :       VT.getStoreSize(), 4);
    4394             :     return DAG.getMemIntrinsicNode(Opcode, DL,
    4395          28 :                                    Op->getVTList(), Ops, VT, MMO);
    4396             :   }
    4397             : 
    4398          32 :   case Intrinsic::amdgcn_tbuffer_store: {
    4399             :     SDValue Ops[] = {
    4400             :       Chain,
    4401          64 :       Op.getOperand(2),  // vdata
    4402          64 :       Op.getOperand(3),  // rsrc
    4403          64 :       Op.getOperand(4),  // vindex
    4404          64 :       Op.getOperand(5),  // voffset
    4405          64 :       Op.getOperand(6),  // soffset
    4406          64 :       Op.getOperand(7),  // offset
    4407          64 :       Op.getOperand(8),  // dfmt
    4408          64 :       Op.getOperand(9),  // nfmt
    4409          64 :       Op.getOperand(10), // glc
    4410          64 :       Op.getOperand(11)  // slc
    4411         352 :     };
    4412          96 :     EVT VT = Op.getOperand(3).getValueType();
    4413         128 :     MachineMemOperand *MMO = MF.getMachineMemOperand(
    4414             :       MachinePointerInfo(),
    4415             :       MachineMemOperand::MOStore,
    4416          64 :       VT.getStoreSize(), 4);
    4417             :     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
    4418          64 :                                    Op->getVTList(), Ops, VT, MMO);
    4419             :   }
    4420             : 
    4421        1268 :   default:
    4422        1268 :     return Op;
    4423             :   }
    4424             : }
    4425             : 
    4426       75121 : SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    4427      150242 :   SDLoc DL(Op);
    4428       75121 :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    4429       75121 :   ISD::LoadExtType ExtType = Load->getExtensionType();
    4430       75121 :   EVT MemVT = Load->getMemoryVT();
    4431             : 
    4432       75121 :   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    4433       11702 :     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
    4434        2854 :       return SDValue();
    4435             : 
    4436             :     // FIXME: Copied from PPC
    4437             :     // First, load into 32 bits, then truncate to the memory type.
    4438             : 
    4439         572 :     SDValue Chain = Load->getChain();
    4440         286 :     SDValue BasePtr = Load->getBasePtr();
    4441         286 :     MachineMemOperand *MMO = Load->getMemOperand();
    4442             : 
    4443         572 :     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
    4444             : 
    4445             :     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
    4446         286 :                                    BasePtr, RealMemVT, MMO);
    4447             : 
    4448             :     SDValue Ops[] = {
    4449         572 :       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
    4450             :       NewLD.getValue(1)
    4451        1144 :     };
    4452             : 
    4453         286 :     return DAG.getMergeValues(Ops, DL);
    4454             :   }
    4455             : 
    4456       71981 :   if (!MemVT.isVector())
    4457           0 :     return SDValue();
    4458             : 
    4459             :   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
    4460             :          "Custom lowering for non-i32 vectors hasn't been implemented.");
    4461             : 
    4462      143962 :   unsigned AS = Load->getAddressSpace();
    4463      215943 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
    4464             :                           AS, Load->getAlignment())) {
    4465           0 :     SDValue Ops[2];
    4466           0 :     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
    4467           0 :     return DAG.getMergeValues(Ops, DL);
    4468             :   }
    4469             : 
    4470       71981 :   MachineFunction &MF = DAG.getMachineFunction();
    4471       71981 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    4472             :   // If there is a possibility that flat instructions access scratch memory
    4473             :   // then we need to use the same legalization rules we use for private.
    4474       71981 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    4475          24 :     AS = MFI->hasFlatScratchInit() ?
    4476             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    4477             : 
    4478       71981 :   unsigned NumElements = MemVT.getVectorNumElements();
    4479       71981 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
    4480       52509 :     if (isMemOpUniform(Load))
    4481       52282 :       return SDValue();
    4482             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    4483             :     // have the same legalization requirements as global and private
    4484             :     // loads.
    4485             :     //
    4486             :   }
    4487       19699 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
    4488       21536 :     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
    4489       16294 :         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
    4490         589 :       return SDValue();
    4491             :     // Non-uniform loads will be selected to MUBUF instructions, so they
    4492             :     // have the same legalization requirements as global and private
    4493             :     // loads.
    4494             :     //
    4495             :   }
    4496       26130 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
    4497        7020 :       AS == AMDGPUASI.FLAT_ADDRESS) {
    4498       12090 :     if (NumElements > 4)
    4499        1193 :       return SplitVectorLoad(Op, DAG);
    4500             :     // v4 loads are supported for private and global memory.
    4501       10897 :     return SDValue();
    4502             :   }
    4503        7020 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    4504             :     // Depending on the setting of the private_element_size field in the
    4505             :     // resource descriptor, we can only make private accesses up to a certain
    4506             :     // size.
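                     :     // For example, with a 4-byte limit a vector private load is scalarized,
                     :     // with an 8-byte limit anything wider than two elements is split, and a
                     :     // 16-byte limit follows the same rules as global memory.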
    4507         347 :     switch (Subtarget->getMaxPrivateElementSize()) {
    4508         174 :     case 4:
    4509         174 :       return scalarizeVectorLoad(Load, DAG);
    4510          53 :     case 8:
    4511          53 :       if (NumElements > 2)
    4512           5 :         return SplitVectorLoad(Op, DAG);
    4513          48 :       return SDValue();
    4514         120 :     case 16:
    4515             :       // Same as global/flat
    4516         120 :       if (NumElements > 4)
    4517           1 :         return SplitVectorLoad(Op, DAG);
    4518         119 :       return SDValue();
    4519           0 :     default:
    4520           0 :       llvm_unreachable("unsupported private_element_size");
    4521             :     }
    4522        6673 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    4523        6673 :     if (NumElements > 2)
    4524         640 :       return SplitVectorLoad(Op, DAG);
    4525             : 
    4526        6033 :     if (NumElements == 2)
    4527        6033 :       return SDValue();
    4528             : 
    4529             :     // If properly aligned, splitting might let us use ds_read_b64.
    4530           0 :     return SplitVectorLoad(Op, DAG);
    4531             :   }
    4532           0 :   return SDValue();
    4533             : }
    4534             : 
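                     : // An i64 select is lowered as two i32 selects on the low and high halves:
                     : // the operands are bitcast to v2i32, each half is selected separately, and
                     : // the result is rebuilt, roughly:
                     : //   lo = select cond, lo(a), lo(b); hi = select cond, hi(a), hi(b)
                     : //   result = bitcast (build_vector lo, hi) to i64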
    4535        1704 : SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    4536        3408 :   if (Op.getValueType() != MVT::i64)
    4537           0 :     return SDValue();
    4538             : 
    4539        1704 :   SDLoc DL(Op);
    4540        3408 :   SDValue Cond = Op.getOperand(0);
    4541             : 
    4542        1704 :   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
    4543        1704 :   SDValue One = DAG.getConstant(1, DL, MVT::i32);
    4544             : 
    4545        5112 :   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    4546        5112 :   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
    4547             : 
    4548        3408 :   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
    4549        3408 :   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
    4550             : 
    4551        1704 :   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
    4552             : 
    4553        3408 :   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
    4554        3408 :   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
    4555             : 
    4556        1704 :   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
    4557             : 
    4558        5112 :   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
    4559        3408 :   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
    4560             : }
    4561             : 
    4562             : // Catch division cases where we can use shortcuts with rcp and rsq
    4563             : // instructions.
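                     : // A rough summary of the shortcuts below (f32 additionally requires that
                     : // denormals are off unless unsafe math is allowed):
                     : //   1.0 / sqrt(x) -> rsq(x)
                     : //   1.0 / x       -> rcp(x)
                     : //  -1.0 / x       -> rcp(fneg x)
                     : //   x / y         -> x * rcp(y)   (unsafe math only)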
    4564         203 : SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
    4565             :                                               SelectionDAG &DAG) const {
    4566         406 :   SDLoc SL(Op);
    4567         406 :   SDValue LHS = Op.getOperand(0);
    4568         406 :   SDValue RHS = Op.getOperand(1);
    4569         406 :   EVT VT = Op.getValueType();
    4570         406 :   const SDNodeFlags Flags = Op->getFlags();
    4571         394 :   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
    4572         365 :                 Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
    4573             : 
    4574         212 :   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
    4575           7 :     return SDValue();
    4576             : 
    4577          69 :   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    4578         137 :     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
    4579          69 :       if (CLHS->isExactlyValue(1.0)) {
    4580             :         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    4581             :         // the CI documentation has a worst case error of 1 ulp.
    4582             :         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    4583             :         // use it as long as we aren't trying to use denormals.
    4584             :         //
    4585             :         // v_rcp_f16 and v_rsq_f16 DO support denormals.
    4586             : 
    4587             :         // 1.0 / sqrt(x) -> rsq(x)
    4588             : 
    4589             :         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
    4590             :         // error seems really high at 2^29 ULP.
    4591         106 :         if (RHS.getOpcode() == ISD::FSQRT)
    4592          12 :           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
    4593             : 
    4594             :         // 1.0 / x -> rcp(x)
    4595          47 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    4596             :       }
    4597             : 
    4598             :       // Same as for 1.0, but expand the sign out of the constant.
    4599          16 :       if (CLHS->isExactlyValue(-1.0)) {
    4600             :         // -1.0 / x -> rcp (fneg x)
    4601          16 :         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    4602          16 :         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    4603             :       }
    4604             :     }
    4605             :   }
    4606             : 
    4607         127 :   if (Unsafe) {
    4608             :     // Turn into multiply by the reciprocal.
    4609             :     // x / y -> x * (1.0 / y)
    4610          80 :     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    4611          80 :     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
    4612             :   }
    4613             : 
    4614          47 :   return SDValue();
    4615             : }
    4616             : 
    4617          52 : static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    4618             :                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
    4619          52 :   if (GlueChain->getNumValues() <= 1) {
    4620           7 :     return DAG.getNode(Opcode, SL, VT, A, B);
    4621             :   }
    4622             : 
    4623             :   assert(GlueChain->getNumValues() == 3);
    4624             : 
    4625          90 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    4626          45 :   switch (Opcode) {
    4627           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    4628          45 :   case ISD::FMUL:
    4629          45 :     Opcode = AMDGPUISD::FMUL_W_CHAIN;
    4630             :     break;
    4631             :   }
    4632             : 
    4633             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
    4634         135 :                      GlueChain.getValue(2));
    4635             : }
    4636             : 
    4637         260 : static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
    4638             :                            EVT VT, SDValue A, SDValue B, SDValue C,
    4639             :                            SDValue GlueChain) {
    4640         260 :   if (GlueChain->getNumValues() <= 1) {
    4641          35 :     return DAG.getNode(Opcode, SL, VT, A, B, C);
    4642             :   }
    4643             : 
    4644             :   assert(GlueChain->getNumValues() == 3);
    4645             : 
    4646         450 :   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
    4647         225 :   switch (Opcode) {
    4648           0 :   default: llvm_unreachable("no chain equivalent for opcode");
    4649         225 :   case ISD::FMA:
    4650         225 :     Opcode = AMDGPUISD::FMA_W_CHAIN;
    4651             :     break;
    4652             :   }
    4653             : 
    4654             :   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
    4655         675 :                      GlueChain.getValue(2));
    4656             : }
    4657             : 
    4658          24 : SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
    4659          24 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    4660          22 :     return FastLowered;
    4661             : 
    4662           2 :   SDLoc SL(Op);
    4663           4 :   SDValue Src0 = Op.getOperand(0);
    4664           4 :   SDValue Src1 = Op.getOperand(1);
    4665             : 
    4666           4 :   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    4667           4 :   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
    4668             : 
    4669           4 :   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
    4670           4 :   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
    4671             : 
    4672           4 :   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
    4673           4 :   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
    4674             : 
    4675           2 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
    4676             : }
    4677             : 
    4678             : // Faster 2.5 ULP division that does not support denormals.
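                     : // The two magic constants below appear to be 2^96 (0x6f800000) and 2^-32
                     : // (0x2f800000): a very large denominator is pre-scaled down before the
                     : // reciprocal and the quotient is scaled back afterwards, keeping v_rcp_f32
                     : // away from inputs whose reciprocal would flush to zero.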
    4679           4 : SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
    4680           8 :   SDLoc SL(Op);
    4681           8 :   SDValue LHS = Op.getOperand(1);
    4682           8 :   SDValue RHS = Op.getOperand(2);
    4683             : 
    4684           8 :   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
    4685             : 
    4686           8 :   const APFloat K0Val(BitsToFloat(0x6f800000));
    4687           4 :   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
    4688             : 
    4689           8 :   const APFloat K1Val(BitsToFloat(0x2f800000));
    4690           4 :   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
    4691             : 
    4692           4 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    4693             : 
    4694             :   EVT SetCCVT =
    4695           8 :     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
    4696             : 
    4697           4 :   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
    4698             : 
    4699           4 :   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
    4700             : 
    4701             :   // TODO: Should this propagate fast-math-flags?
    4702           8 :   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
    4703             : 
    4704             :   // rcp does not support denormals.
    4705           8 :   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
    4706             : 
    4707           8 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
    4708             : 
    4709          12 :   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
    4710             : }
    4711             : 
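                     : // Full-rate f32 division, a sketch of the sequence emitted below: both
                     : // operands are pre-scaled with DIV_SCALE, an initial RCP of the denominator
                     : // is refined by a chain of FMAs, DIV_FMAS applies the final correction using
                     : // the scale flag, and DIV_FIXUP handles the special cases. When f32
                     : // denormals are disabled, denormal flushing is temporarily turned off around
                     : // the FMA chain with s_setreg on the MODE register.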
    4712         172 : SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
    4713         172 :   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    4714         120 :     return FastLowered;
    4715             : 
    4716          52 :   SDLoc SL(Op);
    4717         104 :   SDValue LHS = Op.getOperand(0);
    4718         104 :   SDValue RHS = Op.getOperand(1);
    4719             : 
    4720          52 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
    4721             : 
    4722         104 :   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
    4723             : 
    4724             :   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    4725          52 :                                           RHS, RHS, LHS);
    4726             :   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
    4727          52 :                                         LHS, RHS, LHS);
    4728             : 
    4729             :   // Denominator is scaled to not be denormal, so using rcp is ok.
    4730             :   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
    4731         104 :                                   DenominatorScaled);
    4732             :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
    4733         104 :                                      DenominatorScaled);
    4734             : 
    4735          52 :   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
    4736             :                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
    4737             :                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
    4738             : 
    4739         104 :   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
    4740             : 
    4741          52 :   if (!Subtarget->hasFP32Denormals()) {
    4742          90 :     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    4743             :     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
    4744          45 :                                                       SL, MVT::i32);
    4745             :     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
    4746             :                                        DAG.getEntryNode(),
    4747          45 :                                        EnableDenormValue, BitField);
    4748             :     SDValue Ops[3] = {
    4749             :       NegDivScale0,
    4750             :       EnableDenorm.getValue(0),
    4751             :       EnableDenorm.getValue(1)
    4752         135 :     };
    4753             : 
    4754          45 :     NegDivScale0 = DAG.getMergeValues(Ops, SL);
    4755             :   }
    4756             : 
    4757             :   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
    4758          52 :                              ApproxRcp, One, NegDivScale0);
    4759             : 
    4760             :   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
    4761          52 :                              ApproxRcp, Fma0);
    4762             : 
    4763             :   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
    4764          52 :                            Fma1, Fma1);
    4765             : 
    4766             :   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
    4767          52 :                              NumeratorScaled, Mul);
    4768             : 
    4769          52 :   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
    4770             : 
    4771             :   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
    4772          52 :                              NumeratorScaled, Fma3);
    4773             : 
    4774          52 :   if (!Subtarget->hasFP32Denormals()) {
    4775             :     const SDValue DisableDenormValue =
    4776          45 :         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    4777             :     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
    4778             :                                         Fma4.getValue(1),
    4779             :                                         DisableDenormValue,
    4780             :                                         BitField,
    4781         180 :                                         Fma4.getValue(2));
    4782             : 
    4783             :     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
    4784         135 :                                       DisableDenorm, DAG.getRoot());
    4785          45 :     DAG.setRoot(OutputChain);
    4786             :   }
    4787             : 
    4788         104 :   SDValue Scale = NumeratorScaled.getValue(1);
    4789             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
    4790          52 :                              Fma4, Fma1, Fma3, Scale);
    4791             : 
    4792          52 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
    4793             : }
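                     : 
                     : // A scalar sketch of the Newton-Raphson refinement emitted above, for
                     : // illustration only: RcpApprox stands in for V_RCP_F32, and the div_scale
                     : // pre-scaling, denormal-mode SETREG toggling and the final div_fmas/div_fixup
                     : // steps are deliberately left out, so this is not the exact lowering.
                     : #include <cmath>
                     : static float sketchFDiv32(float X, float Y, float (*RcpApprox)(float)) {
                     :   float Rcp   = RcpApprox(Y);               // initial estimate of 1/Y
                     :   float Err   = std::fma(-Y, Rcp, 1.0f);    // 1 - Y*Rcp
                     :   float Rcp1  = std::fma(Err, Rcp, Rcp);    // refined 1/Y (Fma0/Fma1 above)
                     :   float Quot  = X * Rcp1;                   // first quotient estimate (Mul)
                     :   float Rem   = std::fma(-Y, Quot, X);      // residual X - Y*Quot (Fma2)
                     :   float Quot1 = std::fma(Rem, Rcp1, Quot);  // refined quotient (Fma3)
                     :   return Quot1; // the real lowering feeds Fma4/Fma1/Fma3 into div_fmas/div_fixup
                     : }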
    4794             : 
    4795          68 : SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
    4796          68 :   if (DAG.getTarget().Options.UnsafeFPMath)
    4797           7 :     return lowerFastUnsafeFDIV(Op, DAG);
    4798             : 
    4799          61 :   SDLoc SL(Op);
    4800         122 :   SDValue X = Op.getOperand(0);
    4801         122 :   SDValue Y = Op.getOperand(1);
    4802             : 
    4803          61 :   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
    4804             : 
    4805         122 :   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
    4806             : 
    4807          61 :   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
    4808             : 
    4809         122 :   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
    4810             : 
    4811         122 :   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
    4812             : 
    4813          61 :   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
    4814             : 
    4815          61 :   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
    4816             : 
    4817          61 :   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
    4818             : 
    4819          61 :   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
    4820             : 
    4821          61 :   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
    4822         122 :   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
    4823             : 
    4824             :   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
    4825          61 :                              NegDivScale0, Mul, DivScale1);
    4826             : 
    4827          61 :   SDValue Scale;
    4828             : 
    4829          61 :   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    4830             :     // Work around a hardware bug on SI where the condition output from div_scale
    4831             :     // is not usable.
    4832             : 
    4833          23 :     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
    4834             : 
    4835             :     // Figure out which scale to use for div_fmas.
    4836          46 :     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    4837          46 :     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    4838          46 :     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    4839          46 :     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
    4840             : 
    4841          46 :     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    4842          46 :     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
    4843             : 
    4844             :     SDValue Scale0Hi
    4845          46 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    4846             :     SDValue Scale1Hi
    4847          46 :       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
    4848             : 
    4849          23 :     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    4850          23 :     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    4851          46 :     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
    4852             :   } else {
    4853          76 :     Scale = DivScale1.getValue(1);
    4854             :   }
    4855             : 
    4856             :   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
    4857          61 :                              Fma4, Fma3, Mul, Scale);
    4858             : 
    4859          61 :   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
    4860             : }
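                     : 
                     : // Note on the SOUTHERN_ISLANDS path above: because the i1 result of div_scale
                     : // cannot be trusted on SI, the high dwords of x and y are compared against the
                     : // high dwords of the two div_scale results, and the XOR of those two equality
                     : // checks reconstructs the scale flag for div_fmas; newer subtargets simply use
                     : // DivScale1.getValue(1).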
    4861             : 
    4862         264 : SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
    4863         528 :   EVT VT = Op.getValueType();
    4864             : 
    4865         528 :   if (VT == MVT::f32)
    4866         172 :     return LowerFDIV32(Op, DAG);
    4867             : 
    4868         184 :   if (VT == MVT::f64)
    4869          68 :     return LowerFDIV64(Op, DAG);
    4870             : 
    4871          48 :   if (VT == MVT::f16)
    4872          24 :     return LowerFDIV16(Op, DAG);
    4873             : 
    4874           0 :   llvm_unreachable("Unexpected type for fdiv");
    4875             : }
    4876             : 
    4877       62784 : SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    4878      125568 :   SDLoc DL(Op);
    4879       62784 :   StoreSDNode *Store = cast<StoreSDNode>(Op);
    4880       62784 :   EVT VT = Store->getMemoryVT();
    4881             : 
    4882      125568 :   if (VT == MVT::i1) {
    4883         414 :     return DAG.getTruncStore(Store->getChain(), DL,
    4884         414 :        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
    4885        1035 :        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
    4886             :   }
    4887             : 
    4888             :   assert(VT.isVector() &&
    4889             :          Store->getValue().getValueType().getScalarType() == MVT::i32);
    4890             : 
    4891      125154 :   unsigned AS = Store->getAddressSpace();
    4892      187731 :   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
    4893             :                           AS, Store->getAlignment())) {
    4894          24 :     return expandUnalignedStore(Store, DAG);
    4895             :   }
    4896             : 
    4897       62553 :   MachineFunction &MF = DAG.getMachineFunction();
    4898       62553 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    4899             :   // If there is a possibility that flat instructions access scratch memory,
    4900             :   // then we need to use the same legalization rules we use for private.
    4901       62553 :   if (AS == AMDGPUASI.FLAT_ADDRESS)
    4902         244 :     AS = MFI->hasFlatScratchInit() ?
    4903             :          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
    4904             : 
    4905       62553 :   unsigned NumElements = VT.getVectorNumElements();
    4906       62553 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
    4907             :       AS == AMDGPUASI.FLAT_ADDRESS) {
    4908       39135 :     if (NumElements > 4)
    4909        3806 :       return SplitVectorStore(Op, DAG);
    4910       35329 :     return SDValue();
    4911       23418 :   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
    4912         559 :     switch (Subtarget->getMaxPrivateElementSize()) {
    4913         279 :     case 4:
    4914         279 :       return scalarizeVectorStore(Store, DAG);
    4915         126 :     case 8:
    4916         126 :       if (NumElements > 2)
    4917          10 :         return SplitVectorStore(Op, DAG);
    4918         116 :       return SDValue();
    4919         154 :     case 16:
    4920         154 :       if (NumElements > 4)
    4921           2 :         return SplitVectorStore(Op, DAG);
    4922         152 :       return SDValue();
    4923           0 :     default:
    4924           0 :       llvm_unreachable("unsupported private_element_size");
    4925             :     }
    4926       22859 :   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
    4927       22859 :     if (NumElements > 2)
    4928        2107 :       return SplitVectorStore(Op, DAG);
    4929             : 
    4930       20752 :     if (NumElements == 2)
    4931       20752 :       return Op;
    4932             : 
    4933             :     // If properly aligned, splitting might let us use ds_write_b64.
    4934           0 :     return SplitVectorStore(Op, DAG);
    4935             :   } else {
    4936           0 :     llvm_unreachable("unhandled address space");
    4937             :   }
    4938             : }
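                     : 
                     : // Summary of the splitting rules above: global and flat vector stores wider
                     : // than four elements are split; private stores follow
                     : // getMaxPrivateElementSize() (4 bytes: scalarize, 8 bytes: split above two
                     : // elements, 16 bytes: split above four); LDS stores are split down to
                     : // two-element pieces, which can then use ds_write_b64.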
    4939             : 
    4940          51 : SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    4941         102 :   SDLoc DL(Op);
    4942         102 :   EVT VT = Op.getValueType();
    4943         102 :   SDValue Arg = Op.getOperand(0);
    4944             :   // TODO: Should this propagate fast-math-flags?
    4945             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    4946             :                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
    4947             :                                               DAG.getConstantFP(0.5/M_PI, DL,
    4948         102 :                                                                 VT)));
    4949             : 
    4950         102 :   switch (Op.getOpcode()) {
    4951          24 :   case ISD::FCOS:
    4952          72 :     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
    4953          27 :   case ISD::FSIN:
    4954          81 :     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
    4955           0 :   default:
    4956           0 :     llvm_unreachable("Wrong trig opcode");
    4957             :   }
    4958             : }
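                     : 
                     : // Note on the trig lowering above: the argument is scaled by 0.5/PI and
                     : // reduced to [0, 1) with AMDGPUISD::FRACT because the SIN_HW/COS_HW nodes
                     : // (v_sin_f32/v_cos_f32) take their input in units of full rotations rather
                     : // than radians.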
    4959             : 
    4960         251 : SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    4961         251 :   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
    4962             :   assert(AtomicNode->isCompareAndSwap());
    4963         502 :   unsigned AS = AtomicNode->getAddressSpace();
    4964             : 
    4965             :   // No custom lowering required for local address space
    4966         307 :   if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
    4967          56 :     return Op;
    4968             : 
    4969             :   // Non-local address space requires custom lowering for atomic compare
    4970             :   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
    4971         195 :   SDLoc DL(Op);
    4972         390 :   SDValue ChainIn = Op.getOperand(0);
    4973         390 :   SDValue Addr = Op.getOperand(1);
    4974         390 :   SDValue Old = Op.getOperand(2);
    4975         390 :   SDValue New = Op.getOperand(3);
    4976         390 :   EVT VT = Op.getValueType();
    4977         195 :   MVT SimpleVT = VT.getSimpleVT();
    4978         195 :   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
    4979             : 
    4980         585 :   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
    4981         195 :   SDValue Ops[] = { ChainIn, Addr, NewOld };
    4982             : 
    4983             :   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
    4984         585 :                                  Ops, VT, AtomicNode->getMemOperand());
    4985             : }
    4986             : 
    4987             : //===----------------------------------------------------------------------===//
    4988             : // Custom DAG optimizations
    4989             : //===----------------------------------------------------------------------===//
    4990             : 
    4991        1002 : SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
    4992             :                                                      DAGCombinerInfo &DCI) const {
    4993        2004 :   EVT VT = N->getValueType(0);
    4994        1002 :   EVT ScalarVT = VT.getScalarType();
    4995        1002 :   if (ScalarVT != MVT::f32)
    4996         234 :     return SDValue();
    4997             : 
    4998         768 :   SelectionDAG &DAG = DCI.DAG;
    4999         768 :   SDLoc DL(N);
    5000             : 
    5001        1536 :   SDValue Src = N->getOperand(0);
    5002        1536 :   EVT SrcVT = Src.getValueType();
    5003             : 
    5004             :   // TODO: We could try to match extracting the higher bytes, which would be
    5005             :   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
    5006             :   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
    5007             :   // about in practice.
    5008        1066 :   if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    5009         580 :     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
    5010         106 :       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
    5011         106 :       DCI.AddToWorklist(Cvt.getNode());
    5012         106 :       return Cvt;
    5013             :     }
    5014             :   }
    5015             : 
    5016         662 :   return SDValue();
    5017             : }
    5018             : 
    5019             : /// \brief Return true if the given offset Size in bytes can be folded into
    5020             : /// the immediate offsets of a memory instruction for the given address space.
    5021           0 : static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
    5022             :                           const SISubtarget &STI) {
    5023           0 :   auto AMDGPUASI = STI.getAMDGPUAS();
    5024           0 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
    5025             :     // MUBUF instructions have a 12-bit offset in bytes.
    5026           0 :     return isUInt<12>(OffsetSize);
    5027             :   }
    5028           0 :   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
    5029             :     // SMRD instructions have an 8-bit offset in dwords on SI and
    5030             :     // a 20-bit offset in bytes on VI.
    5031           0 :     if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    5032           0 :       return isUInt<20>(OffsetSize);
    5033             :     else
    5034           0 :       return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
    5035             :   }
    5036           0 :   if (AS == AMDGPUASI.LOCAL_ADDRESS ||
    5037             :       AS == AMDGPUASI.REGION_ADDRESS) {
    5038             :     // The single offset versions have a 16-bit offset in bytes.
    5039           0 :     return isUInt<16>(OffsetSize);
    5040             :   }
    5041             :   // Indirect register addressing does not use any offsets.
    5042             :   return false;
    5043             : }
    5044             : 
    5045             : // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
    5046             : 
    5047             : // This is a variant of
    5048             : // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
    5049             : //
    5050             : // The normal DAG combiner will do this, but only if the add has one use since
    5051             : // that would increase the number of instructions.
    5052             : //
    5053             : // This prevents us from seeing a constant offset that can be folded into a
    5054             : // memory instruction's addressing mode. If we know the resulting add offset of
    5055             : // a pointer can be folded into an addressing offset, we can replace the pointer
    5056             : // operand with the add of the new constant offset. This eliminates one of the
    5057             : // uses, and may allow the remaining use to also be simplified.
    5058             : //
    5059         111 : SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    5060             :                                                unsigned AddrSpace,
    5061             :                                                DAGCombinerInfo &DCI) const {
    5062         222 :   SDValue N0 = N->getOperand(0);
    5063         222 :   SDValue N1 = N->getOperand(1);
    5064             : 
    5065         222 :   if (N0.getOpcode() != ISD::ADD)
    5066          69 :     return SDValue();
    5067             : 
    5068          42 :   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
    5069             :   if (!CN1)
    5070           0 :     return SDValue();
    5071             : 
    5072          84 :   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    5073             :   if (!CAdd)
    5074          42 :     return SDValue();
    5075             : 
    5076             :   // If the resulting offset is too large, we can't fold it into the addressing
    5077             :   // mode offset.
    5078           0 :   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
    5079           0 :   if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
    5080           0 :     return SDValue();
    5081             : 
    5082           0 :   SelectionDAG &DAG = DCI.DAG;
    5083           0 :   SDLoc SL(N);
    5084           0 :   EVT VT = N->getValueType(0);
    5085             : 
    5086           0 :   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
    5087           0 :   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
    5088             : 
    5089           0 :   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
    5090             : }
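                     : 
                     : // Illustrative example (constants invented): for a DS access whose address is
                     : // computed as (shl (add %base, 16), 2), the combine above produces
                     : // (add (shl %base, 2), 64); canFoldOffset() accepts 64 as a 16-bit LDS
                     : // immediate, so the constant can move into the ds_read/ds_write offset field
                     : // instead of requiring a separate address add.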
    5091             : 
    5092      321025 : SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
    5093             :                                                   DAGCombinerInfo &DCI) const {
    5094      321025 :   SDValue Ptr = N->getBasePtr();
    5095      321025 :   SelectionDAG &DAG = DCI.DAG;
    5096      642050 :   SDLoc SL(N);
    5097             : 
    5098             :   // TODO: We could also do this for multiplies.
    5099      321025 :   unsigned AS = N->getAddressSpace();
    5100      642050 :   if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
    5101         111 :     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
    5102         111 :     if (NewPtr) {
    5103           0 :       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
    5104             : 
    5105           0 :       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
    5106           0 :       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
    5107             :     }
    5108             :   }
    5109             : 
    5110      321025 :   return SDValue();
    5111             : }
    5112             : 
    5113             : static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
    5114        3839 :   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
    5115        4498 :          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
    5116        1719 :          (Opc == ISD::XOR && Val == 0);
    5117             : }
    5118             : 
    5119             : // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor. This
    5120             : // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
    5121             : // integer combine opportunities since most 64-bit operations are decomposed
    5122             : // this way.  TODO: We won't want this for SALU especially if it is an inline
    5123             : // immediate.
    5124        1548 : SDValue SITargetLowering::splitBinaryBitConstantOp(
    5125             :   DAGCombinerInfo &DCI,
    5126             :   const SDLoc &SL,
    5127             :   unsigned Opc, SDValue LHS,
    5128             :   const ConstantSDNode *CRHS) const {
    5129        3096 :   uint64_t Val = CRHS->getZExtValue();
    5130        1548 :   uint32_t ValLo = Lo_32(Val);
    5131        1548 :   uint32_t ValHi = Hi_32(Val);
    5132        3096 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    5133             : 
    5134        1231 :   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
    5135         466 :        bitOpWithConstantIsReducible(Opc, ValHi)) ||
    5136         678 :       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
    5137             :     // If we need to materialize a 64-bit immediate, it will be split up later
    5138             :     // anyway. Avoid creating the harder to understand 64-bit immediate
    5139             :     // materialization.
    5140        1130 :     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
    5141             :   }
    5142             : 
    5143         418 :   return SDValue();
    5144             : }
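                     : 
                     : // Illustrative example (constant invented): for (and i64:%x, 0x00000000ffffffff)
                     : // the constant splits into ValLo = 0xffffffff and ValHi = 0, both of which
                     : // bitOpWithConstantIsReducible() accepts, so the 64-bit AND becomes two 32-bit
                     : // ANDs that fold away to lo_32(%x) and 0 without ever materializing the 64-bit
                     : // immediate.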
    5145             : 
    5146             : // Returns true if argument is a boolean value which is not serialized into
    5147             : // memory or argument and does not require v_cndmask_b32 to be deserialized.
    5148             : static bool isBoolSGPR(SDValue V) {
    5149          85 :   if (V.getValueType() != MVT::i1)
    5150             :     return false;
    5151          72 :   switch (V.getOpcode()) {
    5152             :   default: break;
    5153             :   case ISD::SETCC:
    5154             :   case ISD::AND:
    5155             :   case ISD::OR:
    5156             :   case ISD::XOR:
    5157             :   case AMDGPUISD::FP_CLASS:
    5158             :     return true;
    5159             :   }
    5160             :   return false;
    5161             : }
    5162             : 
    5163       27906 : SDValue SITargetLowering::performAndCombine(SDNode *N,
    5164             :                                             DAGCombinerInfo &DCI) const {
    5165       27906 :   if (DCI.isBeforeLegalize())
    5166         707 :     return SDValue();
    5167             : 
    5168       27199 :   SelectionDAG &DAG = DCI.DAG;
    5169       54398 :   EVT VT = N->getValueType(0);
    5170       54398 :   SDValue LHS = N->getOperand(0);
    5171       54398 :   SDValue RHS = N->getOperand(1);
    5172             : 
    5173             : 
    5174       27199 :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    5175       28518 :   if (VT == MVT::i64 && CRHS) {
    5176        1133 :     if (SDValue Split
    5177        2266 :         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
    5178        1053 :       return Split;
    5179             :   }
    5180             : 
    5181       51711 :   if (CRHS && VT == MVT::i32) {
    5182             :     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    5183             :     // nb = number of trailing zeroes in mask
    5184             :     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    5185             :     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    5186             :     // given that we are selecting 8- or 16-bit fields starting at a byte boundary.
    5187       22105 :     unsigned Bits = countPopulation(Mask);
    5188       30118 :     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
    5189       24315 :         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
    5190         213 :       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
    5191          71 :         unsigned Shift = CShift->getZExtValue();
    5192          71 :         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
    5193          71 :         unsigned Offset = NB + Shift;
    5194          71 :         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
    5195         142 :           SDLoc SL(N);
    5196             :           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
    5197         142 :                                     LHS->getOperand(0),
    5198          71 :                                     DAG.getConstant(Offset, SL, MVT::i32),
    5199         213 :                                     DAG.getConstant(Bits, SL, MVT::i32));
    5200          71 :           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
    5201             :           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
    5202          71 :                                     DAG.getValueType(NarrowVT));
    5203         142 :           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
    5204         355 :                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
    5205          71 :           return Shl;
    5206             :         }
    5207             :       }
    5208             :     }
    5209             :   }
    5210             : 
    5211             :   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
    5212             :   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
    5213       52239 :   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    5214         267 :     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    5215         267 :     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
    5216             : 
    5217         178 :     SDValue X = LHS.getOperand(0);
    5218         178 :     SDValue Y = RHS.getOperand(0);
    5219         196 :     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
    5220          81 :       return SDValue();
    5221             : 
    5222           8 :     if (LCC == ISD::SETO) {
    5223          18 :       if (X != LHS.getOperand(1))
    5224           0 :         return SDValue();
    5225             : 
    5226           6 :       if (RCC == ISD::SETUNE) {
    5227          12 :         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
    5228           8 :         if (!C1 || !C1->isInfinity() || C1->isNegative())
    5229           2 :           return SDValue();
    5230             : 
    5231           2 :         const uint32_t Mask = SIInstrFlags::N_NORMAL |
    5232             :                               SIInstrFlags::N_SUBNORMAL |
    5233             :                               SIInstrFlags::N_ZERO |
    5234             :                               SIInstrFlags::P_ZERO |
    5235             :                               SIInstrFlags::P_SUBNORMAL |
    5236             :                               SIInstrFlags::P_NORMAL;
    5237             : 
    5238             :         static_assert(((~(SIInstrFlags::S_NAN |
    5239             :                           SIInstrFlags::Q_NAN |
    5240             :                           SIInstrFlags::N_INFINITY |
    5241             :                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
    5242             :                       "mask not equal");
    5243             : 
    5244           2 :         SDLoc DL(N);
    5245             :         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    5246           6 :                            X, DAG.getConstant(Mask, DL, MVT::i32));
    5247             :       }
    5248             :     }
    5249             :   }
    5250             : 
    5251       48293 :   if (VT == MVT::i32 &&
    5252       66907 :       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    5253             :     // and x, (sext cc from i1) => select cc, x, 0
    5254          22 :     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
    5255             :       std::swap(LHS, RHS);
    5256          33 :     if (isBoolSGPR(RHS.getOperand(0)))
    5257          44 :       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
    5258          66 :                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
    5259             :   }
    5260             : 
    5261       25979 :   return SDValue();
    5262             : }
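                     : 
                     : // Illustrative example (constants invented) of the BFE rewrite above: for
                     : // (and (srl i32:%x, 8), 0xff00) the mask has Bits = 8 and NB = 8 trailing
                     : // zeroes, so Offset = 16 lands on a byte boundary and the node becomes
                     : // (shl (AssertZext i8 (bfe_u32 %x, 16, 8)), 8), a form the GFX8+ SDWA
                     : // peephole can later absorb into a sub-dword operand.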
    5263             : 
    5264       16821 : SDValue SITargetLowering::performOrCombine(SDNode *N,
    5265             :                                            DAGCombinerInfo &DCI) const {
    5266       16821 :   SelectionDAG &DAG = DCI.DAG;
    5267       33642 :   SDValue LHS = N->getOperand(0);
    5268       33642 :   SDValue RHS = N->getOperand(1);
    5269             : 
    5270       33642 :   EVT VT = N->getValueType(0);
    5271       33642 :   if (VT == MVT::i1) {
    5272             :     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    5273         152 :     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
    5274          14 :         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    5275          28 :       SDValue Src = LHS.getOperand(0);
    5276          42 :       if (Src != RHS.getOperand(0))
    5277           1 :         return SDValue();
    5278             : 
    5279          39 :       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    5280          39 :       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    5281          13 :       if (!CLHS || !CRHS)
    5282           0 :         return SDValue();
    5283             : 
    5284             :       // Only 10 bits are used.
    5285             :       static const uint32_t MaxMask = 0x3ff;
    5286             : 
    5287          26 :       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    5288          13 :       SDLoc DL(N);
    5289             :       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
    5290          39 :                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
    5291             :     }
    5292             : 
    5293          55 :     return SDValue();
    5294             :   }
    5295             : 
    5296       16752 :   if (VT != MVT::i64)
    5297       14146 :     return SDValue();
    5298             : 
    5299             :   // TODO: This could be a generic combine with a predicate for extracting the
    5300             :   // high half of an integer being free.
    5301             : 
    5302             :   // (or i64:x, (zero_extend i32:y)) ->
    5303             :   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
    5304        5729 :   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
    5305         517 :       RHS.getOpcode() != ISD::ZERO_EXTEND)
    5306             :     std::swap(LHS, RHS);
    5307             : 
    5308        5212 :   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    5309        3400 :     SDValue ExtSrc = RHS.getOperand(0);
    5310        3400 :     EVT SrcVT = ExtSrc.getValueType();
    5311        1700 :     if (SrcVT == MVT::i32) {
    5312        3400 :       SDLoc SL(N);
    5313             :       SDValue LowLHS, HiBits;
    5314        5100 :       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
    5315        3400 :       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
    5316             : 
    5317        1700 :       DCI.AddToWorklist(LowOr.getNode());
    5318        1700 :       DCI.AddToWorklist(HiBits.getNode());
    5319             : 
    5320             :       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
    5321        3400 :                                 LowOr, HiBits);
    5322        3400 :       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    5323             :     }
    5324             :   }
    5325             : 
    5326        1973 :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    5327             :   if (CRHS) {
    5328         161 :     if (SDValue Split
    5329         322 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
    5330          51 :       return Split;
    5331             :   }
    5332             : 
    5333         855 :   return SDValue();
    5334             : }
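                     : 
                     : // Illustrative example of the zero_extend fold above: in
                     : // (or i64:%x, (zero_extend i32:%y)) only the low half can change, so the node
                     : // is rewritten to (bitcast (v2i32 build_vector (or lo_32(%x), %y), hi_32(%x))),
                     : // replacing a 64-bit OR with one 32-bit OR plus free half extraction.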
    5335             : 
    5336        1113 : SDValue SITargetLowering::performXorCombine(SDNode *N,
    5337             :                                             DAGCombinerInfo &DCI) const {
    5338        2226 :   EVT VT = N->getValueType(0);
    5339        1113 :   if (VT != MVT::i64)
    5340         657 :     return SDValue();
    5341             : 
    5342         912 :   SDValue LHS = N->getOperand(0);
    5343         912 :   SDValue RHS = N->getOperand(1);
    5344             : 
    5345         254 :   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
    5346             :   if (CRHS) {
    5347         254 :     if (SDValue Split
    5348         508 :           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
    5349          26 :       return Split;
    5350             :   }
    5351             : 
    5352         430 :   return SDValue();
    5353             : }
    5354             : 
    5355             : // Instructions that will be lowered with a final instruction that zeros the
    5356             : // high result bits.
    5357             : // XXX - probably only need to list legal operations.
    5358         194 : static bool fp16SrcZerosHighBits(unsigned Opc) {
    5359         194 :   switch (Opc) {
    5360             :   case ISD::FADD:
    5361             :   case ISD::FSUB:
    5362             :   case ISD::FMUL:
    5363             :   case ISD::FDIV:
    5364             :   case ISD::FREM:
    5365             :   case ISD::FMA:
    5366             :   case ISD::FMAD:
    5367             :   case ISD::FCANONICALIZE:
    5368             :   case ISD::FP_ROUND:
    5369             :   case ISD::UINT_TO_FP:
    5370             :   case ISD::SINT_TO_FP:
    5371             :   case ISD::FABS:
    5372             :     // Fabs is lowered to a bit operation, but it's an and which will clear the
    5373             :     // high bits anyway.
    5374             :   case ISD::FSQRT:
    5375             :   case ISD::FSIN:
    5376             :   case ISD::FCOS:
    5377             :   case ISD::FPOWI:
    5378             :   case ISD::FPOW:
    5379             :   case ISD::FLOG:
    5380             :   case ISD::FLOG2:
    5381             :   case ISD::FLOG10:
    5382             :   case ISD::FEXP:
    5383             :   case ISD::FEXP2:
    5384             :   case ISD::FCEIL:
    5385             :   case ISD::FTRUNC:
    5386             :   case ISD::FRINT:
    5387             :   case ISD::FNEARBYINT:
    5388             :   case ISD::FROUND:
    5389             :   case ISD::FFLOOR:
    5390             :   case ISD::FMINNUM:
    5391             :   case ISD::FMAXNUM:
    5392             :   case AMDGPUISD::FRACT:
    5393             :   case AMDGPUISD::CLAMP:
    5394             :   case AMDGPUISD::COS_HW:
    5395             :   case AMDGPUISD::SIN_HW:
    5396             :   case AMDGPUISD::FMIN3:
    5397             :   case AMDGPUISD::FMAX3:
    5398             :   case AMDGPUISD::FMED3:
    5399             :   case AMDGPUISD::FMAD_FTZ:
    5400             :   case AMDGPUISD::RCP:
    5401             :   case AMDGPUISD::RSQ:
    5402             :   case AMDGPUISD::LDEXP:
    5403             :     return true;
    5404          39 :   default:
    5405             :     // fcopysign, select and others may be lowered to 32-bit bit operations
    5406             :     // which don't zero the high bits.
    5407          39 :     return false;
    5408             :   }
    5409             : }
    5410             : 
    5411       12685 : SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
    5412             :                                                    DAGCombinerInfo &DCI) const {
    5413       21082 :   if (!Subtarget->has16BitInsts() ||
    5414        8397 :       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    5415        9895 :     return SDValue();
    5416             : 
    5417        5580 :   EVT VT = N->getValueType(0);
    5418        2790 :   if (VT != MVT::i32)
    5419        1470 :     return SDValue();
    5420             : 
    5421        2640 :   SDValue Src = N->getOperand(0);
    5422        2640 :   if (Src.getValueType() != MVT::i16)
    5423         168 :     return SDValue();
    5424             : 
    5425             :   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
    5426             :   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
    5427        2304 :   if (Src.getOpcode() == ISD::BITCAST) {
    5428         388 :     SDValue BCSrc = Src.getOperand(0);
    5429         582 :     if (BCSrc.getValueType() == MVT::f16 &&
    5430         388 :         fp16SrcZerosHighBits(BCSrc.getOpcode()))
    5431         620 :       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
    5432             :   }
    5433             : 
    5434         997 :   return SDValue();
    5435             : }
    5436             : 
    5437          74 : SDValue SITargetLowering::performClassCombine(SDNode *N,
    5438             :                                               DAGCombinerInfo &DCI) const {
    5439          74 :   SelectionDAG &DAG = DCI.DAG;
    5440         148 :   SDValue Mask = N->getOperand(1);
    5441             : 
    5442             :   // fp_class x, 0 -> false
    5443          51 :   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    5444          51 :     if (CMask->isNullValue())
    5445           6 :       return DAG.getConstant(0, SDLoc(N), MVT::i1);
    5446             :   }
    5447             : 
    5448         216 :   if (N->getOperand(0).isUndef())
    5449           2 :     return DAG.getUNDEF(MVT::i1);
    5450             : 
    5451          70 :   return SDValue();
    5452             : }
    5453             : 
    5454             : static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
    5455          61 :   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
    5456             :     return true;
    5457             : 
    5458          29 :   return DAG.isKnownNeverNaN(Op);
    5459             : }
    5460             : 
    5461         413 : static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
    5462             :                             const SISubtarget *ST, unsigned MaxDepth=5) {
    5463             :   // If source is a result of another standard FP operation it is already in
    5464             :   // canonical form.
    5465             : 
    5466         826 :   switch (Op.getOpcode()) {
    5467             :   default:
    5468             :     break;
    5469             : 
    5470             :   // These will flush denorms if required.
    5471             :   case ISD::FADD:
    5472             :   case ISD::FSUB:
    5473             :   case ISD::FMUL:
    5474             :   case ISD::FSQRT:
    5475             :   case ISD::FCEIL:
    5476             :   case ISD::FFLOOR:
    5477             :   case ISD::FMA:
    5478             :   case ISD::FMAD:
    5479             : 
    5480             :   case ISD::FCANONICALIZE:
    5481             :     return true;
    5482             : 
    5483          20 :   case ISD::FP_ROUND:
    5484          56 :     return Op.getValueType().getScalarType() != MVT::f16 ||
    5485          16 :            ST->hasFP16Denormals();
    5486             : 
    5487           8 :   case ISD::FP_EXTEND:
    5488          28 :     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
    5489           4 :            ST->hasFP16Denormals();
    5490             : 
    5491           0 :   case ISD::FP16_TO_FP:
    5492             :   case ISD::FP_TO_FP16:
    5493           0 :     return ST->hasFP16Denormals();
    5494             : 
    5495             :   // It can/will be lowered or combined as a bit operation.
    5496             :   // Need to check their input recursively to handle.
    5497          68 :   case ISD::FNEG:
    5498             :   case ISD::FABS:
    5499         136 :     return (MaxDepth > 0) &&
    5500         136 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
    5501             : 
    5502          16 :   case ISD::FSIN:
    5503             :   case ISD::FCOS:
    5504             :   case ISD::FSINCOS:
    5505          48 :     return Op.getValueType().getScalarType() != MVT::f16;
    5506             : 
    5507             :   // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
    5508             :   // For such targets need to check their input recursively.
    5509          44 :   case ISD::FMINNUM:
    5510             :   case ISD::FMAXNUM:
    5511             :   case ISD::FMINNAN:
    5512             :   case ISD::FMAXNAN:
    5513             : 
    5514         110 :     if (ST->supportsMinMaxDenormModes() &&
    5515          66 :         DAG.isKnownNeverNaN(Op.getOperand(0)) &&
    5516           0 :         DAG.isKnownNeverNaN(Op.getOperand(1)))
    5517             :       return true;
    5518             : 
    5519          44 :     return (MaxDepth > 0) &&
    5520         100 :            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
    5521          24 :            isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
    5522             : 
    5523          12 :   case ISD::ConstantFP: {
    5524          48 :     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
    5525          24 :     return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
    5526             :   }
    5527             :   }
    5528             :   return false;
    5529             : }
    5530             : 
    5531             : // Constant fold canonicalize.
    5532         401 : SDValue SITargetLowering::performFCanonicalizeCombine(
    5533             :   SDNode *N,
    5534             :   DAGCombinerInfo &DCI) const {
    5535         401 :   SelectionDAG &DAG = DCI.DAG;
    5536         802 :   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
    5537             : 
    5538         401 :   if (!CFP) {
    5539         602 :     SDValue N0 = N->getOperand(0);
    5540         602 :     EVT VT = N0.getValueType().getScalarType();
    5541         301 :     auto ST = getSubtarget();
    5542             : 
    5543         469 :     if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
    5544         290 :          (VT == MVT::f64 && ST->hasFP64Denormals()) ||
    5545         501 :          (VT == MVT::f16 && ST->hasFP16Denormals())) &&
    5546         160 :         DAG.isKnownNeverNaN(N0))
    5547          10 :       return N0;
    5548             : 
    5549         582 :     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
    5550             : 
    5551         583 :     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
    5552         289 :         isCanonicalized(DAG, N0, ST))
    5553          94 :       return N0;
    5554             : 
    5555         197 :     return SDValue();
    5556             :   }
    5557             : 
    5558         200 :   const APFloat &C = CFP->getValueAPF();
    5559             : 
    5560             :   // Flush denormals to 0 if not enabled.
    5561         100 :   if (C.isDenormal()) {
    5562          48 :     EVT VT = N->getValueType(0);
    5563          24 :     EVT SVT = VT.getScalarType();
    5564          28 :     if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
    5565           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    5566             : 
    5567          26 :     if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
    5568           4 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    5569             : 
    5570          36 :     if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
    5571           0 :       return DAG.getConstantFP(0.0, SDLoc(N), VT);
    5572             :   }
    5573             : 
    5574          96 :   if (C.isNaN()) {
    5575          84 :     EVT VT = N->getValueType(0);
    5576          90 :     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
    5577          42 :     if (C.isSignaling()) {
    5578             :       // Quiet a signaling NaN.
    5579          44 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    5580             :     }
    5581             : 
    5582             :     // Make sure it is the canonical NaN bitpattern.
    5583             :     //
    5584             :     // TODO: Can we use -1 as the canonical NaN value since it's an inline
    5585             :     // immediate?
    5586          80 :     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
    5587          28 :       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
    5588             :   }
    5589             : 
    5590         120 :   return N->getOperand(0);
    5591             : }
    5592             : 
    5593             : static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
    5594          51 :   switch (Opc) {
    5595             :   case ISD::FMAXNUM:
    5596             :     return AMDGPUISD::FMAX3;
    5597           5 :   case ISD::SMAX:
    5598             :     return AMDGPUISD::SMAX3;
    5599           5 :   case ISD::UMAX:
    5600             :     return AMDGPUISD::UMAX3;
    5601          12 :   case ISD::FMINNUM:
    5602             :     return AMDGPUISD::FMIN3;
    5603           8 :   case ISD::SMIN:
    5604             :     return AMDGPUISD::SMIN3;
    5605           5 :   case ISD::UMIN:
    5606             :     return AMDGPUISD::UMIN3;
    5607           0 :   default:
    5608           0 :     llvm_unreachable("Not a min/max opcode");
    5609             :   }
    5610             : }
    5611             : 
    5612         152 : SDValue SITargetLowering::performIntMed3ImmCombine(
    5613             :   SelectionDAG &DAG, const SDLoc &SL,
    5614             :   SDValue Op0, SDValue Op1, bool Signed) const {
    5615          60 :   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
    5616             :   if (!K1)
    5617          92 :     return SDValue();
    5618             : 
    5619         177 :   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    5620             :   if (!K0)
    5621           3 :     return SDValue();
    5622             : 
    5623          57 :   if (Signed) {
    5624          96 :     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
    5625           3 :       return SDValue();
    5626             :   } else {
    5627          18 :     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
    5628           3 :       return SDValue();
    5629             :   }
    5630             : 
    5631         102 :   EVT VT = K0->getValueType(0);
    5632          51 :   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
    5633          63 :   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
    5634             :     return DAG.getNode(Med3Opc, SL, VT,
    5635         147 :                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
    5636             :   }
    5637             : 
    5638             :   // If there isn't a 16-bit med3 operation, convert to 32-bit.
    5639           2 :   MVT NVT = MVT::i32;
    5640           2 :   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    5641             : 
    5642           6 :   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
    5643           6 :   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
    5644           4 :   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
    5645             : 
    5646           2 :   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
    5647           2 :   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
    5648             : }
    5649             : 
    5650         627 : static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
    5651             :   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    5652             :     return C;
    5653             : 
    5654          33 :   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    5655          33 :     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
    5656             :       return C;
    5657             :   }
    5658             : 
    5659             :   return nullptr;
    5660             : }
    5661             : 
    5662         380 : SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
    5663             :                                                   const SDLoc &SL,
    5664             :                                                   SDValue Op0,
    5665             :                                                   SDValue Op1) const {
    5666         380 :   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
    5667         380 :   if (!K1)
    5668         133 :     return SDValue();
    5669             : 
    5670         494 :   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
    5671         247 :   if (!K0)
    5672           3 :     return SDValue();
    5673             : 
    5674             :   // Ordered >= (although NaN inputs should have folded away by now).
    5675         732 :   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
    5676         244 :   if (Cmp == APFloat::cmpGreaterThan)
    5677           8 :     return SDValue();
    5678             : 
    5679             :   // TODO: Check IEEE bit enabled?
    5680         472 :   EVT VT = Op0.getValueType();
    5681         236 :   if (Subtarget->enableDX10Clamp()) {
    5682             :     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    5683             :     // hardware fmed3 behavior converting to a min.
    5684             :     // FIXME: Should this be allowing -0.0?
    5685         227 :     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
    5686         348 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
    5687             :   }
    5688             : 
    5689             :   // med3 for f16 is only available on gfx9+, and not available for v2f16.
    5690          83 :   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    5691             :     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    5692             :     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    5693             :     // then give the other result, which is different from med3 with a NaN
    5694             :     // input.
    5695         106 :     SDValue Var = Op0.getOperand(0);
    5696          26 :     if (!isKnownNeverSNan(DAG, Var))
    5697          15 :       return SDValue();
    5698             : 
    5699             :     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
    5700         114 :                        Var, SDValue(K0, 0), SDValue(K1, 0));
    5701             :   }
    5702             : 
    5703           9 :   return SDValue();
    5704             : }
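                     : 
                     : // Illustrative example of the folds above: with dx10_clamp enabled,
                     : // fminnum(fmaxnum(%x, 0.0), 1.0) becomes (AMDGPUISD::CLAMP %x); other constant
                     : // pairs with K0 <= K1 become (AMDGPUISD::FMED3 %x, K0, K1) for f32 (and for f16
                     : // when a 16-bit med3 is available), provided %x is known not to be a signaling
                     : // NaN or FP exceptions are disabled.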
    5705             : 
    5706        2833 : SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    5707             :                                                DAGCombinerInfo &DCI) const {
    5708        2833 :   SelectionDAG &DAG = DCI.DAG;
    5709             : 
    5710        5666 :   EVT VT = N->getValueType(0);
    5711        5666 :   unsigned Opc = N->getOpcode();
    5712        5666 :   SDValue Op0 = N->getOperand(0);
    5713        5666 :   SDValue Op1 = N->getOperand(1);
    5714             : 
    5715             :   // Only do this if the inner op has one use since this will just increase
    5716             :   // register pressure for no benefit.
    5717             : 
    5718             : 
    5719        2833 :   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
    5720        5616 :       VT != MVT::f64 &&
    5721        5549 :       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
    5722             :     // max(max(a, b), c) -> max3(a, b, c)
    5723             :     // min(min(a, b), c) -> min3(a, b, c)
    5724        5005 :     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
    5725          82 :       SDLoc DL(N);
    5726             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    5727             :                          DL,
    5728             :                          N->getValueType(0),
    5729          82 :                          Op0.getOperand(0),
    5730          82 :                          Op0.getOperand(1),
    5731         123 :                          Op1);
    5732             :     }
    5733             : 
    5734             :     // Try commuted.
    5735             :     // max(a, max(b, c)) -> max3(a, b, c)
    5736             :     // min(a, min(b, c)) -> min3(a, b, c)
    5737        4896 :     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
    5738          20 :       SDLoc DL(N);
    5739             :       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
    5740             :                          DL,
    5741             :                          N->getValueType(0),
    5742             :                          Op0,
    5743          20 :                          Op1.getOperand(0),
    5744          50 :                          Op1.getOperand(1));
    5745             :     }
    5746             :   }
    5747             : 
    5748             :   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
    5749        3246 :   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    5750         180 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
    5751          45 :       return Med3;
    5752             :   }
    5753             : 
    5754        3103 :   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    5755         124 :     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
    5756           6 :       return Med3;
    5757             :   }
    5758             : 
    5759             :   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
    5760        1734 :   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
    5761          21 :        (Opc == AMDGPUISD::FMIN_LEGACY &&
    5762          21 :         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
    5763         481 :       (VT == MVT::f32 || VT == MVT::f64 ||
    5764         119 :        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
    5765        3160 :        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    5766         395 :       Op0.hasOneUse()) {
    5767         760 :     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
    5768         212 :       return Res;
    5769             :   }
    5770             : 
    5771        2519 :   return SDValue();
    5772             : }
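
For readers unfamiliar with the med3 folding above, the following standalone C++ sketch (not LLVM code; names are illustrative) checks the scalar identity the combine relies on: for constants K0 < K1 and a non-NaN input x, min(max(x, K0), K1) equals the median of {x, K0, K1}, which is what a single hardware med3 instruction computes.

    #include <algorithm>
    #include <cassert>
    #include <initializer_list>

    // Median of three values, matching what a single med3 computes for
    // non-NaN inputs.
    static float med3(float a, float b, float c) {
      return std::max(std::min(a, b), std::min(std::max(a, b), c));
    }

    int main() {
      const float K0 = 2.0f, K1 = 8.0f;          // K0 < K1, as the combine requires
      for (float x : {-5.0f, 3.0f, 42.0f})
        assert(std::min(std::max(x, K0), K1) == med3(x, K0, K1));
      return 0;
    }
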
    5773             : 
    5774         160 : static bool isClampZeroToOne(SDValue A, SDValue B) {
    5775          98 :   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    5776          86 :     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
    5777             :       // FIXME: Should this be allowing -0.0?
    5778         136 :       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
    5779          62 :              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    5780             :     }
    5781             :   }
    5782             : 
    5783             :   return false;
    5784             : }
    5785             : 
    5786             : // FIXME: Should only worry about snans for version with chain.
    5787         107 : SDValue SITargetLowering::performFMed3Combine(SDNode *N,
    5788             :                                               DAGCombinerInfo &DCI) const {
    5789         214 :   EVT VT = N->getValueType(0);
    5790             :   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
    5791             :   // NaNs. With a NaN input, the order of the operands may change the result.
    5792             : 
    5793         107 :   SelectionDAG &DAG = DCI.DAG;
    5794         214 :   SDLoc SL(N);
    5795             : 
    5796         214 :   SDValue Src0 = N->getOperand(0);
    5797         214 :   SDValue Src1 = N->getOperand(1);
    5798         214 :   SDValue Src2 = N->getOperand(2);
    5799             : 
    5800         107 :   if (isClampZeroToOne(Src0, Src1)) {
    5801             :     // const_a, const_b, x -> clamp is safe in all cases including signaling
    5802             :     // nans.
    5803             :     // FIXME: Should this be allowing -0.0?
    5804          36 :     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
    5805             :   }
    5806             : 
    5807             :   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
    5808             :   // handling no dx10-clamp?
    5809          71 :   if (Subtarget->enableDX10Clamp()) {
    5810             :     // If NaNs are clamped to 0, we are free to reorder the inputs.
    5811             : 
    5812             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    5813             :       std::swap(Src0, Src1);
    5814             : 
    5815             :     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
    5816             :       std::swap(Src1, Src2);
    5817             : 
    5818             :     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
    5819             :       std::swap(Src0, Src1);
    5820             : 
    5821          53 :     if (isClampZeroToOne(Src1, Src2))
    5822          12 :       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
    5823             :   }
    5824             : 
    5825          59 :   return SDValue();
    5826             : }
    5827             : 
    5828         133 : SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
    5829             :                                                  DAGCombinerInfo &DCI) const {
    5830         266 :   SDValue Src0 = N->getOperand(0);
    5831         266 :   SDValue Src1 = N->getOperand(1);
    5832         277 :   if (Src0.isUndef() && Src1.isUndef())
    5833           6 :     return DCI.DAG.getUNDEF(N->getValueType(0));
    5834         130 :   return SDValue();
    5835             : }
    5836             : 
    5837      104353 : SDValue SITargetLowering::performExtractVectorEltCombine(
    5838             :   SDNode *N, DAGCombinerInfo &DCI) const {
    5839      208706 :   SDValue Vec = N->getOperand(0);
    5840             : 
    5841      104353 :   SelectionDAG &DAG= DCI.DAG;
    5842      208706 :   if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
    5843          28 :     SDLoc SL(N);
    5844          28 :     EVT EltVT = N->getValueType(0);
    5845          28 :     SDValue Idx = N->getOperand(1);
    5846             :     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
    5847          28 :                               Vec.getOperand(0), Idx);
    5848          14 :     return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
    5849             :   }
    5850             : 
    5851      104339 :   return SDValue();
    5852             : }
    5853             : 
    5854             : 
    5855         198 : unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
    5856             :                                           const SDNode *N0,
    5857             :                                           const SDNode *N1) const {
    5858         396 :   EVT VT = N0->getValueType(0);
    5859             : 
    5860             :   // Only do this if we are not trying to support denormals. v_mad_f32 does not
    5861             :   // support denormals ever.
    5862         290 :   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
    5863         194 :       (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
    5864             :     return ISD::FMAD;
    5865             : 
    5866          90 :   const TargetOptions &Options = DAG.getTarget().Options;
    5867         180 :   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
    5868          96 :        (N0->getFlags().hasUnsafeAlgebra() &&
    5869         150 :         N1->getFlags().hasUnsafeAlgebra())) &&
    5870          48 :       isFMAFasterThanFMulAndFAdd(VT)) {
    5871             :     return ISD::FMA;
    5872             :   }
    5873             : 
    5874             :   return 0;
    5875             : }
    5876             : 
    5877      118316 : SDValue SITargetLowering::performAddCombine(SDNode *N,
    5878             :                                             DAGCombinerInfo &DCI) const {
    5879      118316 :   SelectionDAG &DAG = DCI.DAG;
    5880      236632 :   EVT VT = N->getValueType(0);
    5881             : 
    5882      118316 :   if (VT != MVT::i32)
    5883      103003 :     return SDValue();
    5884             : 
    5885       15313 :   SDLoc SL(N);
    5886       30626 :   SDValue LHS = N->getOperand(0);
    5887       30626 :   SDValue RHS = N->getOperand(1);
    5888             : 
    5889             :   // add x, zext (setcc) => addcarry x, 0, setcc
    5890             :   // add x, sext (setcc) => subcarry x, 0, setcc
    5891       30626 :   unsigned Opc = LHS.getOpcode();
    5892       30626 :   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
    5893       15313 :       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
    5894             :     std::swap(RHS, LHS);
    5895             : 
    5896       30626 :   Opc = RHS.getOpcode();
    5897       15313 :   switch (Opc) {
    5898             :   default: break;
    5899          58 :   case ISD::ZERO_EXTEND:
    5900             :   case ISD::SIGN_EXTEND:
    5901             :   case ISD::ANY_EXTEND: {
    5902         116 :     auto Cond = RHS.getOperand(0);
    5903          67 :     if (!isBoolSGPR(Cond))
    5904             :       break;
    5905          18 :     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    5906          18 :     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
    5907           9 :     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
    5908           9 :     return DAG.getNode(Opc, SL, VTList, Args);
    5909             :   }
    5910           0 :   case ISD::ADDCARRY: {
    5911             :     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
    5912           0 :     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    5913           0 :     if (!C || C->getZExtValue() != 0) break;
    5914           0 :     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
    5915           0 :     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
    5916             :   }
    5917             :   }
    5918       15304 :   return SDValue();
    5919             : }
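
The zext/sext-of-setcc rewrites above rest on a simple integer identity; the standalone C++ sketch below (not LLVM code; names are illustrative) checks it: adding a zero-extended condition bit is an add-with-carry of (x, 0) with the condition as carry-in, and adding a sign-extended condition bit is the corresponding subtract-with-borrow.

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>
    #include <utility>

    // 32-bit add-with-carry: returns {sum, carry-out}.
    static std::pair<uint32_t, uint32_t> addcarry(uint32_t a, uint32_t b,
                                                  uint32_t carryIn) {
      uint64_t wide = uint64_t(a) + uint64_t(b) + carryIn;
      return {uint32_t(wide), uint32_t(wide >> 32)};
    }

    int main() {
      uint32_t x = 0xDEADBEEFu;
      for (bool cond : {false, true}) {
        // add x, zext(cond)  ==  addcarry x, 0, cond
        assert(x + uint32_t(cond) == addcarry(x, 0, cond).first);
        // add x, sext(cond)  ==  subcarry x, 0, cond (i.e. x - cond), since
        // sign-extending an i1 gives 0 or -1.
        uint32_t sext = cond ? 0xFFFFFFFFu : 0u;
        assert(x + sext == x - uint32_t(cond));
      }
      return 0;
    }
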
    5920             : 
    5921        4569 : SDValue SITargetLowering::performSubCombine(SDNode *N,
    5922             :                                             DAGCombinerInfo &DCI) const {
    5923        4569 :   SelectionDAG &DAG = DCI.DAG;
    5924        9138 :   EVT VT = N->getValueType(0);
    5925             : 
    5926        4569 :   if (VT != MVT::i32)
    5927        2748 :     return SDValue();
    5928             : 
    5929        1821 :   SDLoc SL(N);
    5930        3642 :   SDValue LHS = N->getOperand(0);
    5931        3642 :   SDValue RHS = N->getOperand(1);
    5932             : 
    5933        3642 :   unsigned Opc = LHS.getOpcode();
    5934        1821 :   if (Opc != ISD::SUBCARRY)
    5935             :     std::swap(RHS, LHS);
    5936             : 
    5937        3642 :   if (LHS.getOpcode() == ISD::SUBCARRY) {
    5938             :     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
    5939           3 :     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    5940           1 :     if (!C || C->getZExtValue() != 0)
    5941           0 :       return SDValue();
    5942           3 :     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
    5943           4 :     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
    5944             :   }
    5945        1820 :   return SDValue();
    5946             : }
    5947             : 
    5948          30 : SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
    5949             :   DAGCombinerInfo &DCI) const {
    5950             : 
    5951          60 :   if (N->getValueType(0) != MVT::i32)
    5952           0 :     return SDValue();
    5953             : 
    5954          74 :   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    5955          14 :   if (!C || C->getZExtValue() != 0)
    5956          16 :     return SDValue();
    5957             : 
    5958          14 :   SelectionDAG &DAG = DCI.DAG;
    5959          28 :   SDValue LHS = N->getOperand(0);
    5960             : 
    5961             :   // addcarry (add x, y), 0, cc => addcarry x, y, cc
    5962             :   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
    5963          28 :   unsigned LHSOpc = LHS.getOpcode();
    5964          28 :   unsigned Opc = N->getOpcode();
    5965          28 :   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
    5966          14 :       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
    5967           4 :     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
    5968           4 :     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
    5969             :   }
    5970          13 :   return SDValue();
    5971             : }
    5972             : 
    5973        5447 : SDValue SITargetLowering::performFAddCombine(SDNode *N,
    5974             :                                              DAGCombinerInfo &DCI) const {
    5975        5447 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    5976        3751 :     return SDValue();
    5977             : 
    5978        1696 :   SelectionDAG &DAG = DCI.DAG;
    5979        3392 :   EVT VT = N->getValueType(0);
    5980             : 
    5981        1696 :   SDLoc SL(N);
    5982        3392 :   SDValue LHS = N->getOperand(0);
    5983        3392 :   SDValue RHS = N->getOperand(1);
    5984             : 
    5985             :   // These should really be instruction patterns, but writing patterns with
    5986             :   // source modifiers is a pain.
    5987             : 
    5988             :   // fadd (fadd (a, a), b) -> mad 2.0, a, b
    5989        3392 :   if (LHS.getOpcode() == ISD::FADD) {
    5990         534 :     SDValue A = LHS.getOperand(0);
    5991         630 :     if (A == LHS.getOperand(1)) {
    5992          96 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    5993          96 :       if (FusedOp != 0) {
    5994          64 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    5995          64 :         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    5996             :       }
    5997             :     }
    5998             :   }
    5999             : 
    6000             :   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    6001        3264 :   if (RHS.getOpcode() == ISD::FADD) {
    6002         162 :     SDValue A = RHS.getOperand(0);
    6003         194 :     if (A == RHS.getOperand(1)) {
    6004          32 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    6005          32 :       if (FusedOp != 0) {
    6006          22 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    6007          22 :         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
    6008             :       }
    6009             :     }
    6010             :   }
    6011             : 
    6012        1610 :   return SDValue();
    6013             : }
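
The fadd(fadd(a, a), b) rewrite is justified by the fact that doubling a binary float is exact, so both forms round exactly once; a minimal check on a typical IEEE-754 target (standalone C++, not LLVM code):

    #include <cassert>
    #include <cmath>

    int main() {
      // Doubling a float is exact, so (a + a) + b and fma(2.0f, a, b) each
      // round exactly once and agree (assuming float arithmetic is not
      // evaluated in extended precision and nothing overflows).
      float a = 1.3f, b = -0.75f;
      assert((a + a) + b == std::fma(2.0f, a, b));
      return 0;
    }
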
    6014             : 
    6015        1714 : SDValue SITargetLowering::performFSubCombine(SDNode *N,
    6016             :                                              DAGCombinerInfo &DCI) const {
    6017        1714 :   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    6018        1182 :     return SDValue();
    6019             : 
    6020         532 :   SelectionDAG &DAG = DCI.DAG;
    6021         532 :   SDLoc SL(N);
    6022        1064 :   EVT VT = N->getValueType(0);
    6023             :   assert(!VT.isVector());
    6024             : 
    6025             :   // Try to get the fneg to fold into the source modifier. This undoes generic
    6026             :   // DAG combines and folds them into the mad.
    6027             :   //
    6028             :   // Only do this if we are not trying to support denormals. v_mad_f32 does
    6029             :   // not support denormals ever.
    6030        1064 :   SDValue LHS = N->getOperand(0);
    6031        1064 :   SDValue RHS = N->getOperand(1);
    6032        1064 :   if (LHS.getOpcode() == ISD::FADD) {
    6033             :     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    6034          90 :     SDValue A = LHS.getOperand(0);
    6035         116 :     if (A == LHS.getOperand(1)) {
    6036          26 :       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    6037          26 :       if (FusedOp != 0){
    6038          19 :         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
    6039          19 :         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    6040             : 
    6041          19 :         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    6042             :       }
    6043             :     }
    6044             :   }
    6045             : 
    6046        1026 :   if (RHS.getOpcode() == ISD::FADD) {
    6047             :     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
    6048             : 
    6049         100 :     SDValue A = RHS.getOperand(0);
    6050         144 :     if (A == RHS.getOperand(1)) {
    6051          44 :       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    6052          44 :       if (FusedOp != 0){
    6053          35 :         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
    6054          35 :         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
    6055             :       }
    6056             :     }
    6057             :   }
    6058             : 
    6059         478 :   return SDValue();
    6060             : }
    6061             : 
    6062       10644 : SDValue SITargetLowering::performSetCCCombine(SDNode *N,
    6063             :                                               DAGCombinerInfo &DCI) const {
    6064       10644 :   SelectionDAG &DAG = DCI.DAG;
    6065       21288 :   SDLoc SL(N);
    6066             : 
    6067       21288 :   SDValue LHS = N->getOperand(0);
    6068       21288 :   SDValue RHS = N->getOperand(1);
    6069       21288 :   EVT VT = LHS.getValueType();
    6070       31932 :   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
    6071             : 
    6072             :   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    6073             :   if (!CRHS) {
    6074           0 :     CRHS = dyn_cast<ConstantSDNode>(LHS);
    6075             :     if (CRHS) {
    6076           0 :       std::swap(LHS, RHS);
    6077           0 :       CC = getSetCCSwappedOperands(CC);
    6078             :     }
    6079             :   }
    6080             : 
    6081       23340 :   if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
    6082          35 :       isBoolSGPR(LHS.getOperand(0))) {
    6083             :     // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
    6084             :     // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
    6085             :     // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
    6086             :     // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
    6087           6 :     if ((CRHS->isAllOnesValue() &&
    6088           3 :          (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
    6089           0 :         (CRHS->isNullValue() &&
    6090           0 :          (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
    6091           6 :       return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
    6092           9 :                          DAG.getConstant(-1, SL, MVT::i1));
    6093           0 :     if ((CRHS->isAllOnesValue() &&
    6094           0 :          (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
    6095           0 :         (CRHS->isNullValue() &&
    6096           0 :          (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
    6097           0 :       return LHS.getOperand(0);
    6098             :   }
    6099             : 
    6100       29214 :   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
    6101        3953 :                                            VT != MVT::f16))
    6102        3724 :     return SDValue();
    6103             : 
    6104             :   // Match isinf pattern
    6105             :   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
    6106        7113 :   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    6107           6 :     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    6108             :     if (!CRHS)
    6109           0 :       return SDValue();
    6110             : 
    6111           6 :     const APFloat &APF = CRHS->getValueAPF();
    6112          12 :     if (APF.isInfinity() && !APF.isNegative()) {
    6113           2 :       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
    6114           4 :       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
    6115           6 :                          DAG.getConstant(Mask, SL, MVT::i32));
    6116             :     }
    6117             :   }
    6118             : 
    6119        6915 :   return SDValue();
    6120             : }
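
The fp_class fold above matches the canonical isinf idiom; the P_INFINITY | N_INFINITY mask comes from the code above, the rest of this standalone C++ sketch is purely illustrative of the equivalence being relied on.

    #include <cassert>
    #include <cmath>
    #include <initializer_list>
    #include <limits>

    int main() {
      const float inf = std::numeric_limits<float>::infinity();
      for (float x : {1.0f, -inf, inf, std::nanf("")})
        // The matched idiom: ordered-equal compare of fabs(x) against +inf.
        assert((std::fabs(x) == inf) == static_cast<bool>(std::isinf(x)));
      return 0;
    }
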
    6121             : 
    6122         360 : SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
    6123             :                                                      DAGCombinerInfo &DCI) const {
    6124         360 :   SelectionDAG &DAG = DCI.DAG;
    6125         720 :   SDLoc SL(N);
    6126         720 :   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
    6127             : 
    6128         720 :   SDValue Src = N->getOperand(0);
    6129         720 :   SDValue Srl = N->getOperand(0);
    6130         720 :   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
    6131         104 :     Srl = Srl.getOperand(0);
    6132             : 
    6133             :   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
    6134         720 :   if (Srl.getOpcode() == ISD::SRL) {
    6135             :     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    6136             :     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    6137             :     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
    6138             : 
    6139             :     if (const ConstantSDNode *C =
    6140         177 :         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
    6141         295 :       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
    6142         118 :                                EVT(MVT::i32));
    6143             : 
    6144          59 :       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
    6145          59 :       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
    6146          59 :         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
    6147         177 :                            MVT::f32, Srl);
    6148             :       }
    6149             :     }
    6150             :   }
    6151             : 
    6152         602 :   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
    6153             : 
    6154         602 :   KnownBits Known;
    6155         301 :   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
    6156        1204 :                                         !DCI.isBeforeLegalizeOps());
    6157         301 :   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    6158         602 :   if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
    6159         301 :       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
    6160          95 :     DCI.CommitTargetLoweringOpt(TLO);
    6161             :   }
    6162             : 
    6163         301 :   return SDValue();
    6164             : }
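
The shift folding above is plain byte-index arithmetic; a standalone C++ sketch (not LLVM code) of the identity:

    #include <cassert>
    #include <cstdint>

    // cvt_f32_ubyteN: convert byte N of a 32-bit value to float.
    static float cvtF32UByte(uint32_t x, unsigned n) {
      return static_cast<float>((x >> (8 * n)) & 0xFFu);
    }

    int main() {
      uint32_t x = 0x11223344u;
      // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
      assert(cvtF32UByte(x >> 16, 0) == cvtF32UByte(x, 2));
      // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
      assert(cvtF32UByte(x >> 8, 0) == cvtF32UByte(x, 1));
      return 0;
    }
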
    6165             : 
    6166     1018475 : SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    6167             :                                             DAGCombinerInfo &DCI) const {
    6168     2036950 :   switch (N->getOpcode()) {
    6169      239518 :   default:
    6170      239518 :     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    6171      118316 :   case ISD::ADD:
    6172      118316 :     return performAddCombine(N, DCI);
    6173        4569 :   case ISD::SUB:
    6174        4569 :     return performSubCombine(N, DCI);
    6175          30 :   case ISD::ADDCARRY:
    6176             :   case ISD::SUBCARRY:
    6177          30 :     return performAddCarrySubCarryCombine(N, DCI);
    6178        5447 :   case ISD::FADD:
    6179        5447 :     return performFAddCombine(N, DCI);
    6180        1714 :   case ISD::FSUB:
    6181        1714 :     return performFSubCombine(N, DCI);
    6182       10644 :   case ISD::SETCC:
    6183       10644 :     return performSetCCCombine(N, DCI);
    6184        7827 :   case ISD::FMAXNUM:
    6185             :   case ISD::FMINNUM:
    6186             :   case ISD::SMAX:
    6187             :   case ISD::SMIN:
    6188             :   case ISD::UMAX:
    6189             :   case ISD::UMIN:
    6190             :   case AMDGPUISD::FMIN_LEGACY:
    6191             :   case AMDGPUISD::FMAX_LEGACY: {
    6192       10660 :     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
    6193        2833 :         getTargetMachine().getOptLevel() > CodeGenOpt::None)
    6194        2833 :       return performMinMaxCombine(N, DCI);
    6195             :     break;
    6196             :   }
    6197      465255 :   case ISD::LOAD:
    6198             :   case ISD::STORE:
    6199             :   case ISD::ATOMIC_LOAD:
    6200             :   case ISD::ATOMIC_STORE:
    6201             :   case ISD::ATOMIC_CMP_SWAP:
    6202             :   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    6203             :   case ISD::ATOMIC_SWAP:
    6204             :   case ISD::ATOMIC_LOAD_ADD:
    6205             :   case ISD::ATOMIC_LOAD_SUB:
    6206             :   case ISD::ATOMIC_LOAD_AND:
    6207             :   case ISD::ATOMIC_LOAD_OR:
    6208             :   case ISD::ATOMIC_LOAD_XOR:
    6209             :   case ISD::ATOMIC_LOAD_NAND:
    6210             :   case ISD::ATOMIC_LOAD_MIN:
    6211             :   case ISD::ATOMIC_LOAD_MAX:
    6212             :   case ISD::ATOMIC_LOAD_UMIN:
    6213             :   case ISD::ATOMIC_LOAD_UMAX:
    6214             :   case AMDGPUISD::ATOMIC_INC:
    6215             :   case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
    6216      465255 :     if (DCI.isBeforeLegalize())
    6217             :       break;
    6218      321025 :     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
    6219       27906 :   case ISD::AND:
    6220       27906 :     return performAndCombine(N, DCI);
    6221       16821 :   case ISD::OR:
    6222       16821 :     return performOrCombine(N, DCI);
    6223        1113 :   case ISD::XOR:
    6224        1113 :     return performXorCombine(N, DCI);
    6225       12685 :   case ISD::ZERO_EXTEND:
    6226       12685 :     return performZeroExtendCombine(N, DCI);
    6227          74 :   case AMDGPUISD::FP_CLASS:
    6228          74 :     return performClassCombine(N, DCI);
    6229         401 :   case ISD::FCANONICALIZE:
    6230         401 :     return performFCanonicalizeCombine(N, DCI);
    6231         549 :   case AMDGPUISD::FRACT:
    6232             :   case AMDGPUISD::RCP:
    6233             :   case AMDGPUISD::RSQ:
    6234             :   case AMDGPUISD::RCP_LEGACY:
    6235             :   case AMDGPUISD::RSQ_LEGACY:
    6236             :   case AMDGPUISD::RSQ_CLAMP:
    6237             :   case AMDGPUISD::LDEXP: {
    6238        1098 :     SDValue Src = N->getOperand(0);
    6239        1098 :     if (Src.isUndef())
    6240          11 :       return Src;
    6241             :     break;
    6242             :   }
    6243        1002 :   case ISD::SINT_TO_FP:
    6244             :   case ISD::UINT_TO_FP:
    6245        1002 :     return performUCharToFloatCombine(N, DCI);
    6246         360 :   case AMDGPUISD::CVT_F32_UBYTE0:
    6247             :   case AMDGPUISD::CVT_F32_UBYTE1:
    6248             :   case AMDGPUISD::CVT_F32_UBYTE2:
    6249             :   case AMDGPUISD::CVT_F32_UBYTE3:
    6250         360 :     return performCvtF32UByteNCombine(N, DCI);
    6251         107 :   case AMDGPUISD::FMED3:
    6252         107 :     return performFMed3Combine(N, DCI);
    6253         133 :   case AMDGPUISD::CVT_PKRTZ_F16_F32:
    6254         133 :     return performCvtPkRTZCombine(N, DCI);
    6255          87 :   case ISD::SCALAR_TO_VECTOR: {
    6256          87 :     SelectionDAG &DAG = DCI.DAG;
    6257         174 :     EVT VT = N->getValueType(0);
    6258             : 
    6259             :     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    6260         159 :     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
    6261          52 :       SDLoc SL(N);
    6262          52 :       SDValue Src = N->getOperand(0);
    6263          52 :       EVT EltVT = Src.getValueType();
    6264          41 :       if (EltVT == MVT::f16)
    6265          22 :         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
    6266             : 
    6267          52 :       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
    6268          26 :       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    6269             :     }
    6270             : 
    6271          61 :     break;
    6272             :   }
    6273      103917 :   case ISD::EXTRACT_VECTOR_ELT:
    6274      103917 :     return performExtractVectorEltCombine(N, DCI);
    6275             :   }
    6276      149823 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    6277             : }
    6278             : 
    6279             : /// \brief Helper function for adjustWritemask
    6280             : static unsigned SubIdx2Lane(unsigned Idx) {
    6281             :   switch (Idx) {
    6282             :   default: return 0;
    6283             :   case AMDGPU::sub0: return 0;
    6284             :   case AMDGPU::sub1: return 1;
    6285             :   case AMDGPU::sub2: return 2;
    6286             :   case AMDGPU::sub3: return 3;
    6287             :   }
    6288             : }
    6289             : 
    6290             : /// \brief Adjust the writemask of MIMG instructions
    6291         258 : void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    6292             :                                        SelectionDAG &DAG) const {
    6293         258 :   SDNode *Users[4] = { };
    6294         258 :   unsigned Lane = 0;
    6295         774 :   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
    6296         516 :   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
    6297         258 :   unsigned NewDmask = 0;
    6298             : 
    6299             :   // Try to figure out the used register components
    6300         258 :   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
    6301         680 :        I != E; ++I) {
    6302             : 
    6303             :     // Don't look at users of the chain.
    6304        1116 :     if (I.getUse().getResNo() != 0)
    6305          82 :       continue;
    6306             : 
    6307             :     // Abort if we can't understand the usage
    6308         936 :     if (!I->isMachineOpcode() ||
    6309         460 :         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    6310             :       return;
    6311             : 
    6312             :     // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    6313             :     // Note that subregs are packed, i.e. Lane==0 is the first bit set
    6314             :     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    6315             :     // set, etc.
    6316         684 :     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    6317             : 
    6318             :     // Set which texture component corresponds to the lane.
    6319             :     unsigned Comp;
    6320        1127 :     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
    6321             :       assert(Dmask);
    6322         785 :       Comp = countTrailingZeros(Dmask);
    6323         785 :       Dmask &= ~(1 << Comp);
    6324             :     }
    6325             : 
    6326             :     // Abort if we have more than one user per component
    6327         342 :     if (Users[Lane])
    6328             :       return;
    6329             : 
    6330         340 :     Users[Lane] = *I;
    6331         340 :     NewDmask |= 1 << Comp;
    6332             :   }
    6333             : 
    6334             :   // Abort if there's no change
    6335         122 :   if (NewDmask == OldDmask)
    6336             :     return;
    6337             : 
    6338             :   // Adjust the writemask in the node
    6339         106 :   std::vector<SDValue> Ops;
    6340         288 :   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
    6341         360 :   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
    6342         360 :   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
    6343          72 :   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
    6344             : 
    6345             :   // If we only got one lane, replace it with a copy
    6346             :   // (if NewDmask has only one bit set...)
    6347          72 :   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    6348         114 :     SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
    6349          76 :                                        MVT::i32);
    6350         190 :     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
    6351         152 :                                       SDLoc(), Users[Lane]->getValueType(0),
    6352          38 :                                       SDValue(Node, 0), RC);
    6353          38 :     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    6354             :     return;
    6355             :   }
    6356             : 
    6357             :   // Update the users of the node with the new indices
    6358         306 :   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    6359         136 :     SDNode *User = Users[i];
    6360         136 :     if (!User)
    6361          48 :       continue;
    6362             : 
    6363         352 :     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    6364         176 :     DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
    6365             : 
    6366          88 :     switch (Idx) {
    6367             :     default: break;
    6368          34 :     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    6369          34 :     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    6370          20 :     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    6371             :     }
    6372             :   }
    6373             : }
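
The Lane/Comp loop above relies on result lanes being packed in dmask order; a standalone C++ sketch of that mapping (not LLVM code; __builtin_ctz assumes a GCC/Clang-style compiler):

    #include <cassert>

    // Maps a packed result lane back to the texture component it holds,
    // mirroring the Lane/Comp loop above.
    static unsigned laneToComponent(unsigned dmask, unsigned lane) {
      unsigned comp = 0;
      for (unsigned i = 0; i <= lane; ++i) {
        comp = __builtin_ctz(dmask); // index of the next requested component
        dmask &= ~(1u << comp);      // consume it
      }
      return comp;
    }

    int main() {
      // dmask = 0b1010 requests components Y and W; lane 0 holds Y, lane 1
      // holds W.
      assert(laneToComponent(0xA, 0) == 1);
      assert(laneToComponent(0xA, 1) == 3);
      return 0;
    }
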
    6374             : 
    6375             : static bool isFrameIndexOp(SDValue Op) {
    6376      658682 :   if (Op.getOpcode() == ISD::AssertZext)
    6377          92 :     Op = Op.getOperand(0);
    6378             : 
    6379      329318 :   return isa<FrameIndexSDNode>(Op);
    6380             : }
    6381             : 
    6382             : /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
    6383             : /// with frame index operands.
    6384             : /// LLVM assumes that the inputs to these instructions are registers.
    6385       54264 : SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
    6386             :                                                         SelectionDAG &DAG) const {
    6387       54264 :   if (Node->getOpcode() == ISD::CopyToReg) {
    6388       27990 :     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    6389       18660 :     SDValue SrcVal = Node->getOperand(2);
    6390             : 
    6391             :     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    6392             :     // to try understanding copies to physical registers.
    6393       18815 :     if (SrcVal.getValueType() == MVT::i1 &&
    6394         310 :         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
    6395          16 :       SDLoc SL(Node);
    6396           8 :       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    6397             :       SDValue VReg = DAG.getRegister(
    6398           8 :         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
    6399             : 
    6400           2 :       SDNode *Glued = Node->getGluedNode();
    6401             :       SDValue ToVReg
    6402          16 :         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
    6403          18 :                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
    6404             :       SDValue ToResultReg
    6405             :         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
    6406          24 :                            VReg, ToVReg.getValue(1));
    6407           8 :       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
    6408           8 :       DAG.RemoveDeadNode(Node);
    6409           8 :       return ToResultReg.getNode();
    6410             :     }
    6411             :   }
    6412             : 
    6413       54256 :   SmallVector<SDValue, 8> Ops;
    6414      767194 :   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    6415     1317318 :     if (!isFrameIndexOp(Node->getOperand(i))) {
    6416      658636 :       Ops.push_back(Node->getOperand(i));
    6417      329318 :       continue;
    6418             :     }
    6419             : 
    6420          46 :     SDLoc DL(Node);
    6421          46 :     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
    6422          46 :                                      Node->getOperand(i).getValueType(),
    6423          46 :                                      Node->getOperand(i)), 0));
    6424             :   }
    6425             : 
    6426       54256 :   DAG.UpdateNodeOperands(Node, Ops);
    6427       54256 :   return Node;
    6428             : }
    6429             : 
    6430             : /// \brief Fold the instructions after selecting them.
    6431      355826 : SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
    6432             :                                           SelectionDAG &DAG) const {
    6433      711652 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6434      711652 :   unsigned Opcode = Node->getMachineOpcode();
    6435             : 
    6436      712766 :   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
    6437         668 :       !TII->isGather4(Opcode))
    6438         258 :     adjustWritemask(Node, DAG);
    6439             : 
    6440      711652 :   if (Opcode == AMDGPU::INSERT_SUBREG ||
    6441      355826 :       Opcode == AMDGPU::REG_SEQUENCE) {
    6442       44934 :     legalizeTargetIndependentNode(Node, DAG);
    6443       44934 :     return Node;
    6444             :   }
    6445             : 
    6446      310892 :   switch (Opcode) {
    6447         255 :   case AMDGPU::V_DIV_SCALE_F32:
    6448             :   case AMDGPU::V_DIV_SCALE_F64: {
    6449             :     // Satisfy the operand register constraint when one of the inputs is
    6450             :     // undefined. Ordinarily each undef value will have its own implicit_def of
    6451             :     // a vreg, so force these to use a single register.
    6452         510 :     SDValue Src0 = Node->getOperand(0);
    6453         510 :     SDValue Src1 = Node->getOperand(1);
    6454         510 :     SDValue Src2 = Node->getOperand(2);
    6455             : 
    6456         507 :     if ((Src0.isMachineOpcode() &&
    6457         255 :          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
    6458         124 :         (Src0 == Src1 || Src0 == Src2))
    6459             :       break;
    6460             : 
    6461          12 :     MVT VT = Src0.getValueType().getSimpleVT();
    6462           6 :     const TargetRegisterClass *RC = getRegClassFor(VT);
    6463             : 
    6464           6 :     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    6465           6 :     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
    6466             : 
    6467          18 :     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
    6468          18 :                                       UndefReg, Src0, SDValue());
    6469             : 
    6470             :     // src0 must be the same register as src1 or src2, even if the value is
    6471             :     // undefined, so make sure we don't violate this constraint.
    6472          15 :     if (Src0.isMachineOpcode() &&
    6473           3 :         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
    6474           9 :       if (Src1.isMachineOpcode() &&
    6475           3 :           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    6476             :         Src0 = Src1;
    6477           9 :       else if (Src2.isMachineOpcode() &&
    6478           3 :                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
    6479             :         Src0 = Src2;
    6480             :       else {
    6481             :         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
    6482           1 :         Src0 = UndefReg;
    6483           1 :         Src1 = UndefReg;
    6484             :       }
    6485             :     } else
    6486             :       break;
    6487             : 
    6488           9 :     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
    6489           6 :     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
    6490           0 :       Ops.push_back(Node->getOperand(I));
    6491             : 
    6492           6 :     Ops.push_back(ImpDef.getValue(1));
    6493          12 :     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    6494             :   }
    6495             :   default:
    6496             :     break;
    6497             :   }
    6498             : 
    6499      310889 :   return Node;
    6500             : }
    6501             : 
    6502             : /// \brief Assign the register class depending on the number of
    6503             : /// bits set in the writemask
    6504       29610 : void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
    6505             :                                                      SDNode *Node) const {
    6506       59220 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6507             : 
    6508       29610 :   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    6509             : 
    6510       88830 :   if (TII->isVOP3(MI.getOpcode())) {
    6511             :     // Make sure constant bus requirements are respected.
    6512       27690 :     TII->legalizeOperandsVOP3(MRI, MI);
    6513       27690 :     return;
    6514             :   }
    6515             : 
    6516        1920 :   if (TII->isMIMG(MI)) {
    6517         258 :     unsigned VReg = MI.getOperand(0).getReg();
    6518         258 :     const TargetRegisterClass *RC = MRI.getRegClass(VReg);
    6519             :     // TODO: Need mapping tables to handle other cases (register classes).
    6520         258 :     if (RC != &AMDGPU::VReg_128RegClass)
    6521             :       return;
    6522             : 
    6523         248 :     unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
    6524         496 :     unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
    6525         248 :     unsigned BitsSet = 0;
    6526        1240 :     for (unsigned i = 0; i < 4; ++i)
    6527         992 :       BitsSet += Writemask & (1 << i) ? 1 : 0;
    6528         248 :     switch (BitsSet) {
    6529             :     default: return;
    6530             :     case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
    6531          14 :     case 2:  RC = &AMDGPU::VReg_64RegClass; break;
    6532          20 :     case 3:  RC = &AMDGPU::VReg_96RegClass; break;
    6533             :     }
    6534             : 
    6535          72 :     unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
    6536         216 :     MI.setDesc(TII->get(NewOpcode));
    6537          72 :     MRI.setRegClass(VReg, RC);
    6538          72 :     return;
    6539             :   }
    6540             : 
    6541             :   // Replace unused atomics with the no return version.
    6542        1662 :   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
    6543        1662 :   if (NoRetAtomicOp != -1) {
    6544        1640 :     if (!Node->hasAnyUseOfValue(0)) {
    6545        2568 :       MI.setDesc(TII->get(NoRetAtomicOp));
    6546         856 :       MI.RemoveOperand(0);
    6547         856 :       return;
    6548             :     }
    6549             : 
    6550             :     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    6551             :     // instruction, because the return type of these instructions is a vec2 of
    6552             :     // the memory type, so it can be tied to the input operand.
    6553             :     // This means these instructions always have a use, so we need to add a
    6554             :     // special case to check if the atomic has only one extract_subreg use,
    6555             :     // which itself has no uses.
    6556        1566 :     if ((Node->hasNUsesOfValue(1, 0) &&
    6557        3104 :          Node->use_begin()->isMachineOpcode() &&
    6558        3070 :          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
    6559          36 :          !Node->use_begin()->hasAnyUseOfValue(0))) {
    6560           0 :       unsigned Def = MI.getOperand(0).getReg();
    6561             : 
    6562             :       // Change this into a noret atomic.
    6563           0 :       MI.setDesc(TII->get(NoRetAtomicOp));
    6564           0 :       MI.RemoveOperand(0);
    6565             : 
    6566             :       // If we only remove the def operand from the atomic instruction, the
    6567             :       // extract_subreg will be left with a use of a vreg without a def.
    6568             :       // So we need to insert an implicit_def to avoid machine verifier
    6569             :       // errors.
    6570           0 :       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
    6571           0 :               TII->get(AMDGPU::IMPLICIT_DEF), Def);
    6572             :     }
    6573             :     return;
    6574             :   }
    6575             : }
    6576             : 
    6577       43992 : static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
    6578             :                               uint64_t Val) {
    6579       87984 :   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
    6580       43992 :   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
    6581             : }
    6582             : 
    6583        3966 : MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
    6584             :                                                 const SDLoc &DL,
    6585             :                                                 SDValue Ptr) const {
    6586        7932 :   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    6587             : 
    6588             :   // Build the half of the subregister with the constants before building the
    6589             :   // full 128-bit register. If we are building multiple resource descriptors,
    6590             :   // this will allow CSEing of the 2-component register.
    6591             :   const SDValue Ops0[] = {
    6592        3966 :     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    6593             :     buildSMovImm32(DAG, DL, 0),
    6594        3966 :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    6595        3966 :     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    6596        3966 :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    6597       19830 :   };
    6598             : 
    6599        7932 :   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
    6600       11898 :                                                 MVT::v2i32, Ops0), 0);
    6601             : 
    6602             :   // Combine the constants and the pointer.
    6603             :   const SDValue Ops1[] = {
    6604        3966 :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    6605             :     Ptr,
    6606        3966 :     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    6607             :     SubRegHi,
    6608        3966 :     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
    6609       15864 :   };
    6610             : 
    6611        7932 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
    6612             : }
    6613             : 
    6614             : /// \brief Return a resource descriptor with the 'Add TID' bit enabled
    6615             : ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
    6616             : ///        of the resource descriptor) to create an offset, which is added to
    6617             : ///        the resource pointer.
    6618       18030 : MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
    6619             :                                            SDValue Ptr, uint32_t RsrcDword1,
    6620             :                                            uint64_t RsrcDword2And3) const {
    6621       18030 :   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
    6622       18030 :   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
    6623       18030 :   if (RsrcDword1) {
    6624           0 :     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
    6625           0 :                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
    6626             :                     0);
    6627             :   }
    6628             : 
    6629             :   SDValue DataLo = buildSMovImm32(DAG, DL,
    6630       18030 :                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
    6631       18030 :   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
    6632             : 
    6633             :   const SDValue Ops[] = {
    6634       18030 :     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    6635             :     PtrLo,
    6636       18030 :     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    6637             :     PtrHi,
    6638       18030 :     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    6639             :     DataLo,
    6640       18030 :     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    6641             :     DataHi,
    6642       18030 :     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
    6643      108180 :   };
    6644             : 
    6645       36060 :   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
    6646             : }
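
As a reading aid, the descriptor built above can be pictured as four 32-bit words; the standalone C++ sketch below (not LLVM code; field layout reduced to exactly what the function writes) mirrors that packing.

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Mirrors the packing done above: {ptr_lo, ptr_hi | dword1, dword2,
    // dword3}, where dword2/dword3 come from the 64-bit RsrcDword2And3 value.
    static std::array<uint32_t, 4> buildRsrcWords(uint64_t ptr,
                                                  uint32_t rsrcDword1,
                                                  uint64_t rsrcDword2And3) {
      return {{static_cast<uint32_t>(ptr),
               static_cast<uint32_t>(ptr >> 32) | rsrcDword1,
               static_cast<uint32_t>(rsrcDword2And3),
               static_cast<uint32_t>(rsrcDword2And3 >> 32)}};
    }

    int main() {
      auto R = buildRsrcWords(0x1122334455667788ull, 0, 0xAABBCCDDEEFF0011ull);
      assert(R[0] == 0x55667788u && R[1] == 0x11223344u);
      assert(R[2] == 0xEEFF0011u && R[3] == 0xAABBCCDDu);
      return 0;
    }
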
    6647             : 
    6648             : //===----------------------------------------------------------------------===//
    6649             : //                         SI Inline Assembly Support
    6650             : //===----------------------------------------------------------------------===//
    6651             : 
    6652             : std::pair<unsigned, const TargetRegisterClass *>
    6653        1826 : SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    6654             :                                                StringRef Constraint,
    6655             :                                                MVT VT) const {
    6656        4501 :   if (!isTypeLegal(VT))
    6657         977 :     return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    6658             : 
    6659         849 :   if (Constraint.size() == 1) {
    6660         928 :     switch (Constraint[0]) {
    6661         275 :     case 's':
    6662             :     case 'r':
    6663         275 :       switch (VT.getSizeInBits()) {
    6664           0 :       default:
    6665           0 :         return std::make_pair(0U, nullptr);
    6666         142 :       case 32:
    6667             :       case 16:
    6668         142 :         return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
    6669          63 :       case 64:
    6670          63 :         return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
    6671          10 :       case 128:
    6672          10 :         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
    6673          44 :       case 256:
    6674          44 :         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
    6675          16 :       case 512:
    6676          16 :         return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
    6677             :       }
    6678             : 
    6679         189 :     case 'v':
    6680         189 :       switch (VT.getSizeInBits()) {
    6681           0 :       default:
    6682           0 :         return std::make_pair(0U, nullptr);
    6683         126 :       case 32:
    6684             :       case 16:
    6685         126 :         return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
    6686          36 :       case 64:
    6687          36 :         return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
    6688           0 :       case 96:
    6689           0 :         return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
    6690          27 :       case 128:
    6691          27 :         return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
    6692           0 :       case 256:
    6693           0 :         return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
    6694           0 :       case 512:
    6695           0 :         return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
    6696             :       }
    6697             :     }
    6698             :   }
    6699             : 
    6700         385 :   if (Constraint.size() > 1) {
    6701         385 :     const TargetRegisterClass *RC = nullptr;
    6702         770 :     if (Constraint[1] == 'v') {
    6703             :       RC = &AMDGPU::VGPR_32RegClass;
    6704         279 :     } else if (Constraint[1] == 's') {
    6705             :       RC = &AMDGPU::SGPR_32RegClass;
    6706             :     }
    6707             : 
    6708             :     if (RC) {
    6709             :       uint32_t Idx;
    6710         704 :       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
    6711         352 :       if (!Failed && Idx < RC->getNumRegs())
    6712           0 :         return std::make_pair(RC->getRegister(Idx), RC);
    6713             :     }
    6714             :   }
    6715         385 :   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    6716             : }
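
The 's' and 'v' constraint letters handled above correspond to scalar and vector registers in user inline assembly; a hedged illustration of how they might appear when compiling for an amdgcn target (only the constraint letters come from the code above; the mov mnemonics are assumptions used for illustration, not a recommended idiom).

    // Compiles only when targeting amdgcn.
    static int copyThroughVGPR(int x) {
      int r;
      asm volatile("v_mov_b32 %0, %1" : "=v"(r) : "v"(x)); // "v": 32-bit VGPR
      return r;
    }

    static int copyThroughSGPR(int x) {
      int r;
      asm volatile("s_mov_b32 %0, %1" : "=s"(r) : "s"(x)); // "s": 32-bit SGPR
      return r;
    }
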
    6717             : 
    6718             : SITargetLowering::ConstraintType
    6719        5907 : SITargetLowering::getConstraintType(StringRef Constraint) const {
    6720        5907 :   if (Constraint.size() == 1) {
    6721        4142 :     switch (Constraint[0]) {
    6722             :     default: break;
    6723             :     case 's':
    6724             :     case 'v':
    6725             :       return C_RegisterClass;
    6726             :     }
    6727             :   }
    6728        3980 :   return TargetLowering::getConstraintType(Constraint);
    6729             : }
    6730             : 
    6731             : // Figure out which registers should be reserved for stack access. Only after
    6732             : // the function is legalized do we know all of the non-spill stack objects or if
    6733             : // calls are present.
    6734       14833 : void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
    6735       14833 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    6736       14833 :   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    6737       14833 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
    6738       14833 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    6739       14833 :   const SIRegisterInfo *TRI = ST.getRegisterInfo();
    6740             : 
    6741       14833 :   if (Info->isEntryFunction()) {
    6742             :     // Callable functions have fixed registers used for stack access.
    6743       14174 :     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
    6744             :   }
    6745             : 
    6746             :   // We have to assume the SP is needed in case there are calls in the function
    6747             :   // during lowering. Calls are only detected after the function is
    6748             :   // lowered. We're about to reserve registers, so don't bother using it if we
    6749             :   // aren't really going to use it.
    6750       29007 :   bool NeedSP = !Info->isEntryFunction() ||
    6751       29005 :     MFI.hasVarSizedObjects() ||
    6752         988 :     MFI.hasCalls();
    6753             : 
    6754             :   if (NeedSP) {
    6755         988 :     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
    6756        1976 :     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
    6757             : 
    6758             :     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
    6759             :     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
    6760             :                                Info->getStackPtrOffsetReg()));
    6761         988 :     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
    6762             :   }
    6763             : 
    6764       14833 :   MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
    6765       14833 :   MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
    6766       14833 :   MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
    6767             :                      Info->getScratchWaveOffsetReg());
    6768             : 
    6769       14833 :   TargetLoweringBase::finalizeLowering(MF);
    6770      231751 : }

Generated by: LCOV version 1.13