LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - R600ISelLowering.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 855 937 91.2 %
Date: 2018-02-22 04:41:24 Functions: 37 39 94.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Custom DAG lowering for R600
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "R600ISelLowering.h"
      16             : #include "AMDGPUFrameLowering.h"
      17             : #include "AMDGPUIntrinsicInfo.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "R600Defines.h"
      20             : #include "R600FrameLowering.h"
      21             : #include "R600InstrInfo.h"
      22             : #include "R600MachineFunctionInfo.h"
      23             : #include "Utils/AMDGPUBaseInfo.h"
      24             : #include "llvm/ADT/APFloat.h"
      25             : #include "llvm/ADT/APInt.h"
      26             : #include "llvm/ADT/ArrayRef.h"
      27             : #include "llvm/ADT/DenseMap.h"
      28             : #include "llvm/ADT/SmallVector.h"
      29             : #include "llvm/CodeGen/CallingConvLower.h"
      30             : #include "llvm/CodeGen/DAGCombine.h"
      31             : #include "llvm/CodeGen/ISDOpcodes.h"
      32             : #include "llvm/CodeGen/MachineBasicBlock.h"
      33             : #include "llvm/CodeGen/MachineFunction.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineMemOperand.h"
      37             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      38             : #include "llvm/CodeGen/MachineValueType.h"
      39             : #include "llvm/CodeGen/SelectionDAG.h"
      40             : #include "llvm/IR/Constants.h"
      41             : #include "llvm/IR/DerivedTypes.h"
      42             : #include "llvm/Support/Casting.h"
      43             : #include "llvm/Support/Compiler.h"
      44             : #include "llvm/Support/ErrorHandling.h"
      45             : #include <cassert>
      46             : #include <cstdint>
      47             : #include <iterator>
      48             : #include <utility>
      49             : #include <vector>
      50             : 
      51             : using namespace llvm;
      52             : 
      53         282 : R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
      54         282 :                                        const R600Subtarget &STI)
      55         282 :     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
                         // Sets up the R600 lowering configuration. In order, this constructor:
                         //  - registers R600 register classes for f32/i32 and their 2- and
                         //    4-element vector types,
                         //  - declares how loads, stores, condition codes and individual
                         //    operations are legalized (Custom / Expand / Promote / Legal),
                         //  - adds the actions that depend on subtarget features
                         //    (hasCARRY, hasBORROW, hasBFE, hasFMA),
                         //  - selects the scheduling preference and the opcodes for which
                         //    target DAG combines are requested.
                         // Gen caches STI.getGeneration().
      56             :   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
      57             :   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
      58             :   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
      59             :   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
      60             :   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
      61             :   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
      62             : 
      63         564 :   computeRegisterProperties(STI.getRegisterInfo());
      64             : 
      65             :   // Legalize loads and stores to the private address space.
      66             :   setOperationAction(ISD::LOAD, MVT::i32, Custom);
      67             :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
      68             :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
      69             : 
      70             :   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
      71             :   // spaces, so it is custom lowered to handle those where it isn't.
      72        1974 :   for (MVT VT : MVT::integer_valuetypes()) {
      73             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      74             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
      75             :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
      76             : 
      77             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      78             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
      79             :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
      80             : 
      81             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
      82             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
      83             :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
      84             :   }
      85             : 
      86             :   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
      87             :   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      88             :   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      89             :   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      90             : 
      91             :   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      92             :   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      93             :   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      94             : 
      95             :   setOperationAction(ISD::STORE, MVT::i8, Custom);
      96             :   setOperationAction(ISD::STORE, MVT::i32, Custom);
      97             :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
      98             :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
      99             : 
     100             :   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
     101             :   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
     102             :   // We need to include these since trunc STORES to PRIVATE need
     103             :   // special handling to accommodate RMW
     104             :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
     105             :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
     106             :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
     107             :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
     108             :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
     109             :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
     110             :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
     111             :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
     112             :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
     113             :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
     114             : 
     115             :   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
     116             :   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
     117             :   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
     118             : 
     119             :   // Set condition code actions
     120             :   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
     121             :   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
     122             :   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
     123             :   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
     124             :   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
     125             :   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
     126             :   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
     127             :   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
     128             :   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
     129             :   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
     130             :   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
     131             :   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
     132             : 
     133             :   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
     134             :   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
     135             :   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
     136             :   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
     137             : 
                         // f32 sine and cosine are custom lowered.
     138             :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     139             :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     140             : 
     141             :   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
     142             :   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
     143             : 
                         // BR_CC is expanded; BRCOND is the branch form that gets custom lowering.
     144             :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     145             :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     146             :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     147             : 
     148             :   setOperationAction(ISD::FSUB, MVT::f32, Expand);
     149             : 
                         // SELECT_CC is custom lowered, while plain SETCC and SELECT (below) are
                         // expanded.
     150             :   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
     151             :   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
     152             : 
     153             :   setOperationAction(ISD::SETCC, MVT::i32, Expand);
     154             :   setOperationAction(ISD::SETCC, MVT::f32, Expand);
     155             :   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
     156             :   setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
     157             :   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     158             :   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     159             : 
     160             :   setOperationAction(ISD::SELECT, MVT::i32, Expand);
     161             :   setOperationAction(ISD::SELECT, MVT::f32, Expand);
     162             :   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
     163             :   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
     164             : 
     165             :   // ADD, SUB overflow.
     166             :   // TODO: turn these into Legal?
     167         564 :   if (Subtarget->hasCARRY())
     168             :     setOperationAction(ISD::UADDO, MVT::i32, Custom);
     169             : 
     170         282 :   if (Subtarget->hasBORROW())
     171             :     setOperationAction(ISD::USUBO, MVT::i32, Custom);
     172             : 
     173             :   // Expand sign extension of vectors
                         // The scalar i1/i8/i16 forms are expanded only when the subtarget
                         // reports no BFE support (!hasBFE()).
     174         282 :   if (!Subtarget->hasBFE())
     175             :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
     176             : 
     177             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
     178             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
     179             : 
     180         282 :   if (!Subtarget->hasBFE())
     181             :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
     182             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
     183             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
     184             : 
     185         282 :   if (!Subtarget->hasBFE())
     186             :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
     187             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
     188             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
     189             : 
     190             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
     191             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
     192             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
     193             : 
     194             :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
     195             : 
     196             :   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
     197             : 
                         // Vector element extract/insert on 2- and 4-element i32/f32 vectors is
                         // custom lowered.
     198             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
     199             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
     200             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     201             :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     202             : 
     203             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
     204             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
     205             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     206             :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     207             : 
     208             :   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
     209             :   //  to be Legal/Custom in order to avoid library calls.
     210             :   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
     211             :   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
     212             :   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
     213             : 
     214         282 :   if (!Subtarget->hasFMA()) {
     215             :     setOperationAction(ISD::FMA, MVT::f32, Expand);
     216             :     setOperationAction(ISD::FMA, MVT::f64, Expand);
     217             :   }
     218             : 
     219             :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     220             : 
                         // Carry-chain add/sub nodes are expanded for both i32 and i64.
     221         282 :   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
     222        1410 :   for (MVT VT : ScalarIntVTs) {
     223             :     setOperationAction(ISD::ADDC, VT, Expand);
     224             :     setOperationAction(ISD::SUBC, VT, Expand);
     225             :     setOperationAction(ISD::ADDE, VT, Expand);
     226             :     setOperationAction(ISD::SUBE, VT, Expand);
     227             :   }
     228             : 
     229             :   // LLVM will expand these to atomic_cmp_swap(0)
     230             :   // and atomic_swap, respectively.
     231             :   setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
     232             :   setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
     233             : 
     234             :   // We need to custom lower some of the intrinsics
     235             :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     236             :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     237             : 
     238             :   setSchedulingPreference(Sched::Source);
     239             : 
                         // Opcodes for which target-specific DAG combines are requested.
     240             :   setTargetDAGCombine(ISD::FP_ROUND);
     241             :   setTargetDAGCombine(ISD::FP_TO_SINT);
     242             :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     243             :   setTargetDAGCombine(ISD::SELECT_CC);
     244             :   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
     245             :   setTargetDAGCombine(ISD::LOAD);
     246         282 : }
     247             : 
     248      408878 : const R600Subtarget *R600TargetLowering::getSubtarget() const {
                         // Returns the stored Subtarget pointer downcast to R600Subtarget. The
                         // static_cast is unchecked.
                         // NOTE(review): presumably safe because the constructor only accepts
                         // an R600Subtarget - confirm against the base class.
     249      408878 :   return static_cast<const R600Subtarget *>(Subtarget);
     250             : }
     251             : 
     252        2422 : static inline bool isEOP(MachineBasicBlock::iterator I) {
                         // "End of program" test used by the custom inserter: returns true only
                         // when the instruction immediately after I is AMDGPU::RETURN.
                         // If I is the last instruction of its parent block there is no successor
                         // to inspect, so the answer is false.
     253        4844 :   if (std::next(I) == I->getParent()->end())
     254             :     return false;
     255        4828 :   return std::next(I)->getOpcode() == AMDGPU::RETURN;
     256             : }
     257             : 
     258             : MachineBasicBlock *
     259        8949 : R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     260             :                                                 MachineBasicBlock *BB) const {
                         // Expands the pseudo instruction MI, dispatching on its opcode.
                         //
                         // Cases that `break` out of the switch are finished with MI, and it is
                         // erased at the bottom of the function. Cases that `return BB` directly
                         // leave MI in the block untouched. Opcodes that are neither listed here
                         // nor LDS_*_RET are forwarded to the AMDGPUTargetLowering implementation.
                         //
                         // Always returns BB, except for the forwarded case, which returns
                         // whatever the base class returns.
     261        8949 :   MachineFunction *MF = BB->getParent();
     262        8949 :   MachineRegisterInfo &MRI = MF->getRegInfo();
     263             :   MachineBasicBlock::iterator I = MI;
     264        8949 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
     265             : 
     266       17898 :   switch (MI.getOpcode()) {
     267         858 :   default:
     268             :     // Replace LDS_*_RET instructions that don't have any uses with the
     269             :     // equivalent LDS_*_NORET instruction.
     270         858 :     if (TII->isLDSRetInstr(MI.getOpcode())) {
     271        1716 :       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
     272             :       assert(DstIdx != -1);
     273             :       MachineInstrBuilder NewMI;
     274             :       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
     275             :       //        LDS_1A2D support and remove this special case.
                             // Keep the returning form when its result is used, or when the
                             // opcode is LDS_CMPST_RET.
     276        2604 :       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
     277          30 :           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
     278             :         return BB;
     279             : 
     280          60 :       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
     281          30 :                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
                             // Copy every operand except operand 0 onto the NORET instruction.
                             // NOTE(review): operand 0 is presumably the dst - confirm.
     282         300 :       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
     283         270 :         NewMI.add(MI.getOperand(i));
     284             :       }
     285             :     } else {
     286           0 :       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
     287             :     }
     288             :     break;
                         // CLAMP / FABS / FNEG: build a MOV from operand 1's register into
                         // operand 0's register and attach the matching MO_FLAG_* to that MOV.
     289           0 :   case AMDGPU::CLAMP_R600: {
     290           0 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     291             :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     292           0 :         MI.getOperand(1).getReg());
     293           0 :     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
     294           0 :     break;
     295             :   }
     296             : 
     297          20 :   case AMDGPU::FABS_R600: {
     298          40 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     299             :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     300          40 :         MI.getOperand(1).getReg());
     301          20 :     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
     302          20 :     break;
     303             :   }
     304             : 
     305          20 :   case AMDGPU::FNEG_R600: {
     306          40 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     307             :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     308          40 :         MI.getOperand(1).getReg());
     309          20 :     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
     310          20 :     break;
     311             :   }
     312             : 
                         // MASK_WRITE builds nothing new: it sets MO_FLAG_MASK on the instruction
                         // that defines the virtual register named by operand 0.
     313           0 :   case AMDGPU::MASK_WRITE: {
     314           0 :     unsigned maskedRegister = MI.getOperand(0).getReg();
     315             :     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
     316           0 :     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
     317           0 :     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
     318           0 :     break;
     319             :   }
     320             : 
                         // The FP immediate is passed on as its raw bit pattern
                         // (bitcastToAPInt), not as a converted integer value.
     321          17 :   case AMDGPU::MOV_IMM_F32:
     322          34 :     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
     323             :                                                             .getFPImm()
     324          17 :                                                             ->getValueAPF()
     325          34 :                                                             .bitcastToAPInt()
     326             :                                                             .getZExtValue());
     327          17 :     break;
     328             : 
     329         509 :   case AMDGPU::MOV_IMM_I32:
     330         509 :     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
     331         509 :                      MI.getOperand(1).getImm());
     332         509 :     break;
     333             : 
                         // Build a MOV from ALU_LITERAL_X, then overwrite its `literal` operand
                         // with MI's operand 1.
     334           1 :   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
     335             :     //TODO: Perhaps combine this instruction with the next if possible
     336             :     auto MIB = TII->buildDefaultInstruction(
     337           2 :         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
     338           1 :     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
     339             :     //TODO: Ugh this is rather ugly
     340           2 :     MIB->getOperand(Idx) = MI.getOperand(1);
     341             :     break;
     342             :   }
     343             : 
                         // MOV from ALU_CONST, with src0_sel set to the immediate in operand 1.
     344        2701 :   case AMDGPU::CONST_COPY: {
     345        5402 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     346        8103 :         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
     347        2701 :     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
     348        2701 :                        MI.getOperand(1).getImm());
     349        2701 :     break;
     350             :   }
     351             : 
                         // RAT writes/stores are re-emitted with the same opcode and operands,
                         // plus one extra immediate holding the isEOP(I) result.
     352        2360 :   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
     353             :   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
     354             :   case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
     355        7080 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     356        2360 :         .add(MI.getOperand(0))
     357        2360 :         .add(MI.getOperand(1))
     358        2360 :         .addImm(isEOP(I)); // Set End of program bit
     359        2360 :     break;
     360             : 
     361           2 :   case AMDGPU::RAT_STORE_TYPED_eg:
     362           6 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     363           2 :         .add(MI.getOperand(0))
     364           2 :         .add(MI.getOperand(1))
     365           2 :         .add(MI.getOperand(2))
     366           2 :         .addImm(isEOP(I)); // Set End of program bit
     367           2 :     break;
     368             : 
                         // BRANCH becomes a JUMP that reuses operand 0.
     369         131 :   case AMDGPU::BRANCH:
     370         524 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
     371         131 :         .add(MI.getOperand(0));
     372         131 :     break;
     373             : 
                         // Conditional branches become two instructions: a PRED_X defining
                         // PREDICATE_BIT from operand 1 (PRED_SETNE for f32, PRED_SETNE_INT for
                         // i32), followed by a JUMP_COND on operand 0 that kills PREDICATE_BIT.
     374           0 :   case AMDGPU::BRANCH_COND_f32: {
     375             :     MachineInstr *NewMI =
     376           0 :         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
     377           0 :                 AMDGPU::PREDICATE_BIT)
     378           0 :             .add(MI.getOperand(1))
     379             :             .addImm(AMDGPU::PRED_SETNE)
     380             :             .addImm(0); // Flags
     381           0 :     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
     382           0 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
     383           0 :         .add(MI.getOperand(0))
     384           0 :         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     385           0 :     break;
     386             :   }
     387             : 
     388          84 :   case AMDGPU::BRANCH_COND_i32: {
     389             :     MachineInstr *NewMI =
     390         168 :         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
     391          84 :                 AMDGPU::PREDICATE_BIT)
     392          84 :             .add(MI.getOperand(1))
     393             :             .addImm(AMDGPU::PRED_SETNE_INT)
     394             :             .addImm(0); // Flags
     395          84 :     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
     396         336 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
     397          84 :         .add(MI.getOperand(0))
     398          84 :         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     399          84 :     break;
     400             :   }
     401             : 
     402          60 :   case AMDGPU::EG_ExportSwz:
     403             :   case AMDGPU::R600_ExportSwz: {
     404             :     // Instruction is left unmodified if it is not the last one of its type
                           // (and is not followed directly by RETURN, see the EOP check below).
     405             :     bool isLastInstructionOfItsType = true;
     406          60 :     unsigned InstExportType = MI.getOperand(1).getImm();
                           // Scan the remainder of the block for a later export whose operand 1
                           // (the export type) equals this one's.
     407         129 :     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
     408         189 :          EndBlock = BB->end(); NextExportInst != EndBlock;
     409             :          NextExportInst = std::next(NextExportInst)) {
     410         276 :       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
     411             :           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
     412          23 :         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
     413          23 :             .getImm();
     414          23 :         if (CurrentInstExportType == InstExportType) {
     415             :           isLastInstructionOfItsType = false;
     416             :           break;
     417             :         }
     418             :       }
     419             :     }
     420          60 :     bool EOP = isEOP(I);
     421          60 :     if (!EOP && !isLastInstructionOfItsType)
     422             :       return BB;
                           // NOTE(review): 84 and 40 are presumably the export CF instruction
                           // encodings for the EG and R600 variants - confirm and name them.
     423         102 :     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
                           // Re-emit the export with operands 0-6 copied, followed by the CF
                           // instruction value and the EOP flag as immediates.
     424         153 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     425          51 :         .add(MI.getOperand(0))
     426          51 :         .add(MI.getOperand(1))
     427          51 :         .add(MI.getOperand(2))
     428          51 :         .add(MI.getOperand(3))
     429          51 :         .add(MI.getOperand(4))
     430          51 :         .add(MI.getOperand(5))
     431          51 :         .add(MI.getOperand(6))
     432          51 :         .addImm(CfInst)
     433          51 :         .addImm(EOP);
     434          51 :     break;
     435             :   }
                         // RETURN is left in place: returning here skips the erase below.
     436             :   case AMDGPU::RETURN: {
     437             :     return BB;
     438             :   }
     439             :   }
     440             : 
                         // Every case that broke out of the switch is done with the original
                         // pseudo instruction, so remove it.
     441        5926 :   MI.eraseFromParent();
     442        5926 :   return BB;
     443             : }
     444             : 
     445             : //===----------------------------------------------------------------------===//
     446             : // Custom DAG Lowering Operations
     447             : //===----------------------------------------------------------------------===//
     448             : 
     449      102431 : SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     450      102431 :   MachineFunction &MF = DAG.getMachineFunction();
     451      102431 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
     452      102431 :   switch (Op.getOpcode()) {
     453         447 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     454       10924 :   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
     455           7 :   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
     456          50 :   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
     457          28 :   case ISD::SRA_PARTS:
     458          28 :   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
     459          64 :   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
     460         620 :   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
     461          17 :   case ISD::FCOS:
     462          17 :   case ISD::FSIN: return LowerTrig(Op, DAG);
     463       16358 :   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
     464       32944 :   case ISD::STORE: return LowerSTORE(Op, DAG);
     465       38512 :   case ISD::LOAD: {
     466       38512 :     SDValue Result = LowerLOAD(Op, DAG);
     467             :     assert((!Result.getNode() ||
     468             :             Result.getNode()->getNumValues() == 2) &&
     469             :            "Load should return a value and a chain");
     470       38512 :     return Result;
     471             :   }
     472             : 
     473          84 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
     474          57 :   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
     475        1590 :   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
     476          82 :   case ISD::INTRINSIC_VOID: {
     477          82 :     SDValue Chain = Op.getOperand(0);
     478             :     unsigned IntrinsicID =
     479         164 :                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     480          82 :     switch (IntrinsicID) {
     481          60 :     case AMDGPUIntrinsic::r600_store_swizzle: {
     482             :       SDLoc DL(Op);
     483             :       const SDValue Args[8] = {
     484             :         Chain,
     485             :         Op.getOperand(2), // Export Value
     486             :         Op.getOperand(3), // ArrayBase
     487             :         Op.getOperand(4), // Type
     488          60 :         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
     489          60 :         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
     490          60 :         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
     491          60 :         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
     492         360 :       };
     493          60 :       return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
     494             :     }
     495             : 
     496             :     // default for switch(IntrinsicID)
     497             :     default: break;
     498             :     }
     499             :     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
     500             :     break;
     501             :   }
     502         647 :   case ISD::INTRINSIC_WO_CHAIN: {
     503             :     unsigned IntrinsicID =
     504        1294 :                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     505         647 :     EVT VT = Op.getValueType();
     506             :     SDLoc DL(Op);
     507         647 :     switch (IntrinsicID) {
     508         276 :     case AMDGPUIntrinsic::r600_tex:
     509             :     case AMDGPUIntrinsic::r600_texc: {
     510             :       unsigned TextureOp;
     511         276 :       switch (IntrinsicID) {
     512             :       case AMDGPUIntrinsic::r600_tex:
     513             :         TextureOp = 0;
     514             :         break;
     515           7 :       case AMDGPUIntrinsic::r600_texc:
     516             :         TextureOp = 1;
     517           7 :         break;
     518           0 :       default:
     519           0 :         llvm_unreachable("unhandled texture operation");
     520             :       }
     521             : 
     522             :       SDValue TexArgs[19] = {
     523         552 :         DAG.getConstant(TextureOp, DL, MVT::i32),
     524             :         Op.getOperand(1),
     525         276 :         DAG.getConstant(0, DL, MVT::i32),
     526         276 :         DAG.getConstant(1, DL, MVT::i32),
     527         276 :         DAG.getConstant(2, DL, MVT::i32),
     528         276 :         DAG.getConstant(3, DL, MVT::i32),
     529             :         Op.getOperand(2),
     530             :         Op.getOperand(3),
     531             :         Op.getOperand(4),
     532         276 :         DAG.getConstant(0, DL, MVT::i32),
     533         276 :         DAG.getConstant(1, DL, MVT::i32),
     534         276 :         DAG.getConstant(2, DL, MVT::i32),
     535         276 :         DAG.getConstant(3, DL, MVT::i32),
     536             :         Op.getOperand(5),
     537             :         Op.getOperand(6),
     538             :         Op.getOperand(7),
     539             :         Op.getOperand(8),
     540             :         Op.getOperand(9),
     541             :         Op.getOperand(10)
     542        3312 :       };
     543         276 :       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
     544             :     }
     545          32 :     case AMDGPUIntrinsic::r600_dot4: {
     546             :       SDValue Args[8] = {
     547             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     548          64 :           DAG.getConstant(0, DL, MVT::i32)),
     549             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     550          64 :           DAG.getConstant(0, DL, MVT::i32)),
     551             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     552          64 :           DAG.getConstant(1, DL, MVT::i32)),
     553             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     554          64 :           DAG.getConstant(1, DL, MVT::i32)),
     555             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     556          64 :           DAG.getConstant(2, DL, MVT::i32)),
     557             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     558          64 :           DAG.getConstant(2, DL, MVT::i32)),
     559             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     560          64 :           DAG.getConstant(3, DL, MVT::i32)),
     561             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     562          64 :           DAG.getConstant(3, DL, MVT::i32))
     563         256 :       };
     564          32 :       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
     565             :     }
     566             : 
     567           2 :     case Intrinsic::r600_implicitarg_ptr: {
     568           2 :       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
     569           2 :       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
     570           2 :       return DAG.getConstant(ByteOffset, DL, PtrVT);
     571             :     }
     572           1 :     case Intrinsic::r600_read_ngroups_x:
     573           1 :       return LowerImplicitParameter(DAG, VT, DL, 0);
     574           1 :     case Intrinsic::r600_read_ngroups_y:
     575           1 :       return LowerImplicitParameter(DAG, VT, DL, 1);
     576           1 :     case Intrinsic::r600_read_ngroups_z:
     577           1 :       return LowerImplicitParameter(DAG, VT, DL, 2);
     578           2 :     case Intrinsic::r600_read_global_size_x:
     579           2 :       return LowerImplicitParameter(DAG, VT, DL, 3);
     580           2 :     case Intrinsic::r600_read_global_size_y:
     581           2 :       return LowerImplicitParameter(DAG, VT, DL, 4);
     582           2 :     case Intrinsic::r600_read_global_size_z:
     583           2 :       return LowerImplicitParameter(DAG, VT, DL, 5);
     584           8 :     case Intrinsic::r600_read_local_size_x:
     585           8 :       return LowerImplicitParameter(DAG, VT, DL, 6);
     586          36 :     case Intrinsic::r600_read_local_size_y:
     587          36 :       return LowerImplicitParameter(DAG, VT, DL, 7);
     588          36 :     case Intrinsic::r600_read_local_size_z:
     589          36 :       return LowerImplicitParameter(DAG, VT, DL, 8);
     590             : 
     591           4 :     case Intrinsic::r600_read_tgid_x:
     592             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     593           4 :                                      AMDGPU::T1_X, VT);
     594           3 :     case Intrinsic::r600_read_tgid_y:
     595             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     596           3 :                                      AMDGPU::T1_Y, VT);
     597           3 :     case Intrinsic::r600_read_tgid_z:
     598             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     599           3 :                                      AMDGPU::T1_Z, VT);
     600         162 :     case Intrinsic::r600_read_tidig_x:
     601             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     602         162 :                                      AMDGPU::T0_X, VT);
     603          32 :     case Intrinsic::r600_read_tidig_y:
     604             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     605          32 :                                      AMDGPU::T0_Y, VT);
     606          32 :     case Intrinsic::r600_read_tidig_z:
     607             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     608          32 :                                      AMDGPU::T0_Z, VT);
     609             : 
     610             :     case Intrinsic::r600_recipsqrt_ieee:
     611           3 :       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
     612             : 
     613             :     case Intrinsic::r600_recipsqrt_clamped:
     614           5 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
     615           4 :     default:
     616           4 :       return Op;
     617             :     }
     618             : 
     619             :     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
     620             :     break;
     621             :   }
     622             :   } // end switch(Op.getOpcode())
     623          22 :   return SDValue();
     624             : }
     625             : 
     626         105 : void R600TargetLowering::ReplaceNodeResults(SDNode *N,
     627             :                                             SmallVectorImpl<SDValue> &Results,
     628             :                                             SelectionDAG &DAG) const {
     629         210 :   switch (N->getOpcode()) {
     630          53 :   default:
     631          53 :     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
     632          53 :     return;
     633             :   case ISD::FP_TO_UINT:
     634           9 :     if (N->getValueType(0) == MVT::i1) {
     635           4 :       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
     636           2 :       return;
     637             :     }
     638             :     // Since we don't care about out of bounds values we can use FP_TO_SINT for
     639             :     // uints too. The DAGLegalizer code for uint considers some extra cases
     640             :     // which are not necessary here.
     641             :     LLVM_FALLTHROUGH;
     642             :   case ISD::FP_TO_SINT: {
     643          16 :     if (N->getValueType(0) == MVT::i1) {
     644           4 :       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
     645           2 :       return;
     646             :     }
     647             : 
     648          14 :     SDValue Result;
     649          14 :     if (expandFP_TO_SINT(N, Result, DAG))
     650          14 :       Results.push_back(Result);
     651             :     return;
     652             :   }
     653             :   case ISD::SDIVREM: {
     654             :     SDValue Op = SDValue(N, 1);
     655          12 :     SDValue RES = LowerSDIVREM(Op, DAG);
     656          12 :     Results.push_back(RES);
     657          12 :     Results.push_back(RES.getValue(1));
     658             :     break;
     659             :   }
     660             :   case ISD::UDIVREM: {
     661             :     SDValue Op = SDValue(N, 0);
     662          22 :     LowerUDIVREM64(Op, DAG, Results);
     663             :     break;
     664             :   }
     665             :   }
     666             : }
     667             : 
     668          16 : SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
     669             :                                                    SDValue Vector) const {
     670             :   SDLoc DL(Vector);
     671          16 :   EVT VecVT = Vector.getValueType();
     672          16 :   EVT EltVT = VecVT.getVectorElementType();
     673             :   SmallVector<SDValue, 8> Args;
     674             : 
     675          64 :   for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
     676          48 :     Args.push_back(DAG.getNode(
     677             :         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
     678         192 :         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
     679             :   }
     680             : 
     681          32 :   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
     682             : }
     683             : 
     684       10924 : SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     685             :                                                     SelectionDAG &DAG) const {
     686             :   SDLoc DL(Op);
     687       10924 :   SDValue Vector = Op.getOperand(0);
     688       10924 :   SDValue Index = Op.getOperand(1);
     689             : 
     690          42 :   if (isa<ConstantSDNode>(Index) ||
     691             :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     692       10910 :     return Op;
     693             : 
     694          14 :   Vector = vectorToVerticalVector(DAG, Vector);
     695             :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
     696          14 :                      Vector, Index);
     697             : }
     698             : 
     699           7 : SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     700             :                                                    SelectionDAG &DAG) const {
     701             :   SDLoc DL(Op);
     702           7 :   SDValue Vector = Op.getOperand(0);
     703           7 :   SDValue Value = Op.getOperand(1);
     704           7 :   SDValue Index = Op.getOperand(2);
     705             : 
     706           3 :   if (isa<ConstantSDNode>(Index) ||
     707             :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     708           6 :     return Op;
     709             : 
     710           1 :   Vector = vectorToVerticalVector(DAG, Vector);
     711             :   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
     712           1 :                                Vector, Value, Index);
     713           1 :   return vectorToVerticalVector(DAG, Insert);
     714             : }
     715             : 
     716          57 : SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
     717             :                                                SDValue Op,
     718             :                                                SelectionDAG &DAG) const {
     719             :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
     720          57 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
     721          42 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
     722             : 
     723          15 :   const DataLayout &DL = DAG.getDataLayout();
     724          15 :   const GlobalValue *GV = GSD->getGlobal();
     725             :   MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
     726             : 
     727          30 :   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
     728          30 :   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
     729             : }
     730             : 
     731          17 : SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
     732             :   // On hw >= R700, COS/SIN input must be between -1. and 1.
     733             :   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
     734          17 :   EVT VT = Op.getValueType();
     735          17 :   SDValue Arg = Op.getOperand(0);
     736             :   SDLoc DL(Op);
     737             : 
     738             :   // TODO: Should this propagate fast-math-flags?
     739             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
     740             :       DAG.getNode(ISD::FADD, DL, VT,
     741             :         DAG.getNode(ISD::FMUL, DL, VT, Arg,
     742             :           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
     743          34 :         DAG.getConstantFP(0.5, DL, MVT::f32)));
     744             :   unsigned TrigNode;
     745          17 :   switch (Op.getOpcode()) {
     746             :   case ISD::FCOS:
     747             :     TrigNode = AMDGPUISD::COS_HW;
     748             :     break;
     749          11 :   case ISD::FSIN:
     750             :     TrigNode = AMDGPUISD::SIN_HW;
     751          11 :     break;
     752           0 :   default:
     753           0 :     llvm_unreachable("Wrong trig opcode");
     754             :   }
     755             :   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
     756             :       DAG.getNode(ISD::FADD, DL, VT, FractPart,
     757          17 :         DAG.getConstantFP(-0.5, DL, MVT::f32)));
     758          17 :   if (Gen >= R600Subtarget::R700)
     759          17 :     return TrigVal;
     760             :   // On R600 hw, COS/SIN input must be between -Pi and Pi.
     761             :   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
     762           0 :       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
     763             : }
     764             : 
     765          50 : SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
     766             :   SDLoc DL(Op);
     767          50 :   EVT VT = Op.getValueType();
     768             : 
     769          50 :   SDValue Lo = Op.getOperand(0);
     770          50 :   SDValue Hi = Op.getOperand(1);
     771          50 :   SDValue Shift = Op.getOperand(2);
     772          50 :   SDValue Zero = DAG.getConstant(0, DL, VT);
     773          50 :   SDValue One  = DAG.getConstant(1, DL, VT);
     774             : 
     775          50 :   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
     776          50 :   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
     777          50 :   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
     778          50 :   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
     779             : 
     780             :   // The dance around Width1 is necessary for the Shift == 0 special case.
     781             :   // Without it the CompShift might be 32, producing incorrect results in
     782             :   // Overflow. So we do the shift in two steps, the alternative is to
     783             :   // add a conditional to filter the special case.
     784             : 
     785          50 :   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
     786          50 :   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
     787             : 
     788          50 :   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
     789          50 :   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
     790          50 :   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
     791             : 
     792          50 :   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
     793          50 :   SDValue LoBig = Zero;
     794             : 
     795          50 :   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
     796          50 :   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
     797             : 
     798         100 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
     799             : }
     800             : 
     801          28 : SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
     802             :   SDLoc DL(Op);
     803          28 :   EVT VT = Op.getValueType();
     804             : 
     805          28 :   SDValue Lo = Op.getOperand(0);
     806          28 :   SDValue Hi = Op.getOperand(1);
     807          28 :   SDValue Shift = Op.getOperand(2);
     808          28 :   SDValue Zero = DAG.getConstant(0, DL, VT);
     809          28 :   SDValue One  = DAG.getConstant(1, DL, VT);
     810             : 
     811             :   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
     812             : 
     813          28 :   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
     814          28 :   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
     815          28 :   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
     816          28 :   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
     817             : 
     818             :   // The dance around Width1 is necessary for the Shift == 0 special case.
     819             :   // Without it the CompShift might be 32, producing incorrect results in
     820             :   // Overflow. So we do the shift in two steps, the alternative is to
     821             :   // add a conditional to filter the special case.
     822             : 
     823          28 :   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
     824          28 :   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
     825             : 
     826          28 :   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
     827          28 :   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
     828          28 :   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
     829             : 
     830          28 :   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
     831          35 :   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
     832             : 
     833          28 :   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
     834          28 :   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
     835             : 
     836          56 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
     837             : }
     838             : 
     839         684 : SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
     840             :                                           unsigned mainop, unsigned ovf) const {
     841             :   SDLoc DL(Op);
     842         684 :   EVT VT = Op.getValueType();
     843             : 
     844         684 :   SDValue Lo = Op.getOperand(0);
     845         684 :   SDValue Hi = Op.getOperand(1);
     846             : 
     847         684 :   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
     848             :   // Extend sign.
     849         684 :   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
     850        1368 :                     DAG.getValueType(MVT::i1));
     851             : 
     852         684 :   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
     853             : 
     854        1368 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
     855             : }
     856             : 
     857           2 : SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
     858             :   SDLoc DL(Op);
     859             :   return DAG.getNode(
     860             :       ISD::SETCC,
     861             :       DL,
     862             :       MVT::i1,
     863             :       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
     864           8 :       DAG.getCondCode(ISD::SETEQ));
     865             : }
     866             : 
     867           2 : SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
     868             :   SDLoc DL(Op);
     869             :   return DAG.getNode(
     870             :       ISD::SETCC,
     871             :       DL,
     872             :       MVT::i1,
     873             :       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
     874           8 :       DAG.getCondCode(ISD::SETEQ));
     875             : }
     876             : 
     877          89 : SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
     878             :                                                    const SDLoc &DL,
     879             :                                                    unsigned DwordOffset) const {
     880          89 :   unsigned ByteOffset = DwordOffset * 4;
     881          89 :   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
     882          89 :                                       AMDGPUASI.CONSTANT_BUFFER_0);
     883             : 
     884             :   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
     885             :   assert(isInt<16>(ByteOffset));
     886             : 
     887             :   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
     888             :                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
     889         267 :                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
     890             : }
     891             : 
     892       21260 : bool R600TargetLowering::isZero(SDValue Op) const {
     893             :   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     894        9772 :     return Cst->isNullValue();
     895             :   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
     896         440 :     return CstFP->isZero();
     897             :   } else {
     898             :     return false;
     899             :   }
     900             : }
     901             : 
     902       32686 : bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
     903             :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     904         936 :     return CFP->isExactlyValue(1.0);
     905             :   }
     906       32218 :   return isAllOnesConstant(Op);
     907             : }
     908             : 
     909        5892 : bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
     910             :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     911         310 :     return CFP->getValueAPF().isZero();
     912             :   }
     913        5737 :   return isNullConstant(Op);
     914             : }
     915             : 
     916       16358 : SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
     917             :   SDLoc DL(Op);
     918       16358 :   EVT VT = Op.getValueType();
     919             : 
     920       16358 :   SDValue LHS = Op.getOperand(0);
     921       16358 :   SDValue RHS = Op.getOperand(1);
     922       16358 :   SDValue True = Op.getOperand(2);
     923       16358 :   SDValue False = Op.getOperand(3);
     924       16358 :   SDValue CC = Op.getOperand(4);
     925             :   SDValue Temp;
     926             : 
     927             :   if (VT == MVT::f32) {
     928             :     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
     929         379 :     SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
     930         379 :     if (MinMax)
     931          19 :       return MinMax;
     932             :   }
     933             : 
     934             :   // LHS and RHS are guaranteed to be the same value type
     935       16339 :   EVT CompareVT = LHS.getValueType();
     936             : 
     937             :   // Check if we can lower this to a native operation.
     938             : 
     939             :   // Try to lower to a SET* instruction:
     940             :   //
     941             :   // SET* can match the following patterns:
     942             :   //
     943             :   // select_cc f32, f32, -1,  0, cc_supported
     944             :   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
     945             :   // select_cc i32, i32, -1,  0, cc_supported
     946             :   //
     947             : 
     948             :   // Move hardware True/False values to the correct operand.
     949       16339 :   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
     950             :   ISD::CondCode InverseCC =
     951       16339 :      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
     952       16339 :   if (isHWTrueValue(False) && isHWFalseValue(True)) {
     953             :     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
     954             :       std::swap(False, True);
     955          11 :       CC = DAG.getCondCode(InverseCC);
     956             :     } else {
     957          71 :       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
     958             :       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
     959             :         std::swap(False, True);
     960             :         std::swap(LHS, RHS);
     961          60 :         CC = DAG.getCondCode(SwapInvCC);
     962             :       }
     963             :     }
     964             :   }
     965             : 
     966       16339 :   if (isHWTrueValue(True) && isHWFalseValue(False) &&
     967             :       (CompareVT == VT || VT == MVT::i32)) {
     968             :     // This can be matched by a SET* instruction.
     969        5709 :     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
     970             :   }
     971             : 
     972             :   // Try to lower to a CND* instruction:
     973             :   //
     974             :   // CND* can match the following patterns:
     975             :   //
     976             :   // select_cc f32, 0.0, f32, f32, cc_supported
     977             :   // select_cc f32, 0.0, i32, i32, cc_supported
     978             :   // select_cc i32, 0,   f32, f32, cc_supported
     979             :   // select_cc i32, 0,   i32, i32, cc_supported
     980             :   //
     981             : 
     982             :   // Try to move the zero value to the RHS
     983       10630 :   if (isZero(LHS)) {
     984           3 :     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
     985             :     // Try swapping the operands
     986           3 :     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
     987             :     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
     988             :       std::swap(LHS, RHS);
     989           0 :       CC = DAG.getCondCode(CCSwapped);
     990             :     } else {
     991             :       // Try inverting the condition and then swapping the operands
     992           3 :       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
     993           3 :       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
     994             :       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
     995             :         std::swap(True, False);
     996             :         std::swap(LHS, RHS);
     997           0 :         CC = DAG.getCondCode(CCSwapped);
     998             :       }
     999             :     }
    1000             :   }
    1001       10630 :   if (isZero(RHS)) {
    1002        9752 :     SDValue Cond = LHS;
    1003        9752 :     SDValue Zero = RHS;
    1004        9752 :     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    1005        9752 :     if (CompareVT != VT) {
    1006             :       // Bitcast True / False to the correct types.  This will end up being
    1007             :       // a nop, but it allows us to define only a single pattern in the
    1008             :       // .TD files for each CND* instruction rather than having to have
    1009             :       // one pattern for integer True/False and one for fp True/False
    1010          55 :       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
    1011          55 :       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    1012             :     }
    1013             : 
    1014             :     switch (CCOpcode) {
    1015             :     case ISD::SETONE:
    1016             :     case ISD::SETUNE:
    1017             :     case ISD::SETNE:
    1018        2149 :       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
    1019             :       Temp = True;
    1020             :       True = False;
    1021             :       False = Temp;
    1022        2149 :       break;
    1023             :     default:
    1024             :       break;
    1025             :     }
    1026             :     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
    1027             :         Cond, Zero,
    1028             :         True, False,
    1029        9752 :         DAG.getCondCode(CCOpcode));
    1030        9752 :     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
    1031             :   }
    1032             : 
    1033             :   // If we make it this far it means we have no native instructions to handle
    1034             :   // this SELECT_CC, so we must lower it.
    1035         878 :   SDValue HWTrue, HWFalse;
    1036             : 
    1037             :   if (CompareVT == MVT::f32) {
    1038          60 :     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    1039          60 :     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
    1040             :   } else if (CompareVT == MVT::i32) {
    1041         818 :     HWTrue = DAG.getConstant(-1, DL, CompareVT);
    1042         818 :     HWFalse = DAG.getConstant(0, DL, CompareVT);
    1043             :   }
    1044             :   else {
    1045           0 :     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
    1046             :   }
    1047             : 
    1048             :   // Lower this unsupported SELECT_CC into a combination of two supported
    1049             :   // SELECT_CC operations.
    1050         878 :   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
    1051             : 
    1052             :   return DAG.getNode(ISD::SELECT_CC, DL, VT,
    1053             :       Cond, HWFalse,
    1054             :       True, False,
    1055         878 :       DAG.getCondCode(ISD::SETNE));
    1056             : }
    1057             : 
    1058             : /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
    1059             : /// convert these pointers to a register index.  Each register holds
    1060             : /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
    1061             : /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
    1062             : /// for indirect addressing.
    1063           0 : SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
    1064             :                                                unsigned StackWidth,
    1065             :                                                SelectionDAG &DAG) const {
    1066             :   unsigned SRLPad;
    1067           0 :   switch(StackWidth) {
    1068             :   case 1:
    1069             :     SRLPad = 2;
    1070             :     break;
    1071           0 :   case 2:
    1072             :     SRLPad = 3;
    1073           0 :     break;
    1074           0 :   case 4:
    1075             :     SRLPad = 4;
    1076           0 :     break;
    1077           0 :   default: llvm_unreachable("Invalid stack width");
    1078             :   }
    1079             : 
    1080             :   SDLoc DL(Ptr);
    1081             :   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
    1082           0 :                      DAG.getConstant(SRLPad, DL, MVT::i32));
    1083             : }
    1084             : 
    1085           0 : void R600TargetLowering::getStackAddress(unsigned StackWidth,
    1086             :                                          unsigned ElemIdx,
    1087             :                                          unsigned &Channel,
    1088             :                                          unsigned &PtrIncr) const {
    1089           0 :   switch (StackWidth) {
    1090           0 :   default:
    1091             :   case 1:
    1092           0 :     Channel = 0;
    1093           0 :     if (ElemIdx > 0) {
    1094           0 :       PtrIncr = 1;
    1095             :     } else {
    1096           0 :       PtrIncr = 0;
    1097             :     }
    1098             :     break;
    1099           0 :   case 2:
    1100           0 :     Channel = ElemIdx % 2;
    1101           0 :     if (ElemIdx == 2) {
    1102           0 :       PtrIncr = 1;
    1103             :     } else {
    1104           0 :       PtrIncr = 0;
    1105             :     }
    1106             :     break;
    1107           0 :   case 4:
    1108           0 :     Channel = ElemIdx;
    1109           0 :     PtrIncr = 0;
    1110           0 :     break;
    1111             :   }
    1112           0 : }
    1113             : 
    1114        1283 : SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
    1115             :                                                    SelectionDAG &DAG) const {
    1116             :   SDLoc DL(Store);
    1117             :   //TODO: Who creates the i8 stores?
    1118             :   assert(Store->isTruncatingStore()
    1119             :          || Store->getValue().getValueType() == MVT::i8);
    1120             :   assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
    1121             : 
    1122        1283 :   SDValue Mask;
    1123             :   if (Store->getMemoryVT() == MVT::i8) {
    1124             :     assert(Store->getAlignment() >= 1);
    1125         591 :     Mask = DAG.getConstant(0xff, DL, MVT::i32);
    1126             :   } else if (Store->getMemoryVT() == MVT::i16) {
    1127             :     assert(Store->getAlignment() >= 2);
    1128         692 :     Mask = DAG.getConstant(0xffff, DL, MVT::i32);
    1129             :   } else {
    1130           0 :     llvm_unreachable("Unsupported private trunc store");
    1131             :   }
    1132             : 
    1133        1283 :   SDValue OldChain = Store->getChain();
    1134        1283 :   bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
    1135             :   // Skip dummy
    1136        2566 :   SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
    1137        1283 :   SDValue BasePtr = Store->getBasePtr();
    1138        1283 :   SDValue Offset = Store->getOffset();
    1139        1283 :   EVT MemVT = Store->getMemoryVT();
    1140             : 
    1141        1283 :   SDValue LoadPtr = BasePtr;
    1142        1283 :   if (!Offset.isUndef()) {
    1143           0 :     LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
    1144             :   }
    1145             : 
    1146             :   // Get dword location
    1147             :   // TODO: this should be eliminated by the future SHR ptr, 2
    1148             :   SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
    1149        2566 :                             DAG.getConstant(0xfffffffc, DL, MVT::i32));
    1150             : 
    1151             :   // Load dword
    1152             :   // TODO: can we be smarter about machine pointer info?
    1153        1283 :   MachinePointerInfo PtrInfo(UndefValue::get(
    1154        1283 :       Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
    1155        1283 :   SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
    1156             : 
    1157        1283 :   Chain = Dst.getValue(1);
    1158             : 
    1159             :   // Get offset in dword
    1160             :   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
    1161        2566 :                                 DAG.getConstant(0x3, DL, MVT::i32));
    1162             : 
    1163             :   // Convert byte offset to bit shift
    1164             :   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
    1165        2566 :                                  DAG.getConstant(3, DL, MVT::i32));
    1166             : 
    1167             :   // TODO: Contrary to the name of the function,
    1168             :   // it also handles sub i32 non-truncating stores (like i1)
    1169             :   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
    1170        1283 :                                   Store->getValue());
    1171             : 
    1172             :   // Mask the value to the right type
    1173        1283 :   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
    1174             : 
    1175             :   // Shift the value in place
    1176             :   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
    1177        1283 :                                      MaskedValue, ShiftAmt);
    1178             : 
    1179             :   // Shift the mask in place
    1180        1283 :   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
    1181             : 
    1182             :   // Invert the mask. NOTE: if we had native ROL instructions we could
    1183             :   // use inverted mask
    1184        1283 :   DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
    1185             : 
    1186             :   // Cleanup the target bits
    1187        1283 :   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
    1188             : 
    1189             :   // Add the new bits
    1190        1283 :   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
    1191             : 
    1192             :   // Store dword
    1193             :   // TODO: Can we be smarter about MachinePointerInfo?
    1194        1283 :   SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo);
    1195             : 
    1196             :   // If we are part of an expanded vector, make our neighbors depend on this store
    1197        1283 :   if (VectorTrunc) {
    1198             :     // Make all other vector elements depend on this store
    1199         838 :     Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
    1200         838 :     DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
    1201             :   }
    1202        2566 :   return NewStore;
    1203             : }
    1204             : 
    1205       32944 : SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    1206             :   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
    1207             :   unsigned AS = StoreNode->getAddressSpace();
    1208             : 
    1209       32944 :   SDValue Chain = StoreNode->getChain();
    1210       32944 :   SDValue Ptr = StoreNode->getBasePtr();
    1211       32944 :   SDValue Value = StoreNode->getValue();
    1212             : 
    1213       32944 :   EVT VT = Value.getValueType();
    1214       32944 :   EVT MemVT = StoreNode->getMemoryVT();
    1215       32944 :   EVT PtrVT = Ptr.getValueType();
    1216             : 
    1217             :   SDLoc DL(Op);
    1218             : 
    1219             :   // Neither LOCAL nor PRIVATE can do vectors at the moment
    1220       56776 :   if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
    1221             :       VT.isVector()) {
    1222        1219 :     if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
    1223             :          StoreNode->isTruncatingStore()) {
    1224             :       // Add an extra level of chain to isolate this vector
    1225         235 :       SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
    1226             :       // TODO: can the chain be replaced without creating a new store?
    1227             :       SDValue NewStore = DAG.getTruncStore(
    1228         235 :           NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
    1229             :           MemVT, StoreNode->getAlignment(),
    1230         705 :           StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
    1231             :       StoreNode = cast<StoreSDNode>(NewStore);
    1232             :     }
    1233             : 
    1234         924 :     return scalarizeVectorStore(StoreNode, DAG);
    1235             :   }
    1236             : 
    1237       32020 :   unsigned Align = StoreNode->getAlignment();
    1238       32255 :   if (Align < MemVT.getStoreSize() &&
    1239         235 :       !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
    1240          24 :     return expandUnalignedStore(StoreNode, DAG);
    1241             :   }
    1242             : 
    1243             :   SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
    1244       31996 :                                   DAG.getConstant(2, DL, PtrVT));
    1245             : 
    1246       31996 :   if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
    1247             :     // It is beneficial to create MSKOR here instead of combiner to avoid
    1248             :     // artificial dependencies introduced by RMW
    1249        9106 :     if (StoreNode->isTruncatingStore()) {
    1250             :       assert(VT.bitsLE(MVT::i32));
    1251         203 :       SDValue MaskConstant;
    1252             :       if (MemVT == MVT::i8) {
    1253         115 :         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
    1254             :       } else {
    1255             :         assert(MemVT == MVT::i16);
    1256             :         assert(StoreNode->getAlignment() >= 2);
    1257          88 :         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
    1258             :       }
    1259             : 
    1260             :       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
    1261         203 :                                       DAG.getConstant(0x00000003, DL, PtrVT));
    1262             :       SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
    1263         203 :                                      DAG.getConstant(3, DL, VT));
    1264             : 
    1265             :       // Put the mask in correct place
    1266         203 :       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
    1267             : 
    1268             :       // Put the value bits in correct place
    1269         203 :       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
    1270         203 :       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
    1271             : 
    1272             :       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
    1273             :       // vector instead.
    1274             :       SDValue Src[4] = {
    1275             :         ShiftedValue,
    1276         203 :         DAG.getConstant(0, DL, MVT::i32),
    1277         203 :         DAG.getConstant(0, DL, MVT::i32),
    1278             :         Mask
    1279         812 :       };
    1280         203 :       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
    1281         203 :       SDValue Args[3] = { Chain, Input, DWordAddr };
    1282             :       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
    1283             :                                      Op->getVTList(), Args, MemVT,
    1284         609 :                                      StoreNode->getMemOperand());
    1285       11780 :     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
    1286             :       // Convert pointer from byte address to dword address.
    1287        2842 :       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    1288             : 
    1289        5684 :       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
    1290           0 :         llvm_unreachable("Truncated and indexed stores not supported yet");
    1291             :       } else {
    1292        2842 :         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    1293             :       }
    1294        2842 :       return Chain;
    1295             :     }
    1296             :   }
    1297             : 
    1298             :   // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
    1299       28951 :   if (AS != AMDGPUASI.PRIVATE_ADDRESS)
    1300       18478 :     return SDValue();
    1301             : 
    1302       10473 :   if (MemVT.bitsLT(MVT::i32))
    1303        1283 :     return lowerPrivateTruncStore(StoreNode, DAG);
    1304             : 
    1305             :   // Standard i32+ store, tag it with DWORDADDR to note that the address
    1306             :   // has been shifted
    1307        9190 :   if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    1308        2731 :     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    1309        2731 :     return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    1310             :   }
    1311             : 
    1312             :   // Tagged i32+ stores will be matched by patterns
    1313        6459 :   return SDValue();
    1314             : }
    1315             : 
    1316             : // return (512 + (kc_bank << 12))
    1317             : static int
    1318             : ConstantAddressBlock(unsigned AddressSpace) {
    1319             :   switch (AddressSpace) {
    1320             :   case AMDGPUAS::CONSTANT_BUFFER_0:
    1321             :     return 512;
    1322             :   case AMDGPUAS::CONSTANT_BUFFER_1:
    1323             :     return 512 + 4096;
    1324             :   case AMDGPUAS::CONSTANT_BUFFER_2:
    1325             :     return 512 + 4096 * 2;
    1326             :   case AMDGPUAS::CONSTANT_BUFFER_3:
    1327             :     return 512 + 4096 * 3;
    1328             :   case AMDGPUAS::CONSTANT_BUFFER_4:
    1329             :     return 512 + 4096 * 4;
    1330             :   case AMDGPUAS::CONSTANT_BUFFER_5:
    1331             :     return 512 + 4096 * 5;
    1332             :   case AMDGPUAS::CONSTANT_BUFFER_6:
    1333             :     return 512 + 4096 * 6;
    1334             :   case AMDGPUAS::CONSTANT_BUFFER_7:
    1335             :     return 512 + 4096 * 7;
    1336             :   case AMDGPUAS::CONSTANT_BUFFER_8:
    1337             :     return 512 + 4096 * 8;
    1338             :   case AMDGPUAS::CONSTANT_BUFFER_9:
    1339             :     return 512 + 4096 * 9;
    1340             :   case AMDGPUAS::CONSTANT_BUFFER_10:
    1341             :     return 512 + 4096 * 10;
    1342             :   case AMDGPUAS::CONSTANT_BUFFER_11:
    1343             :     return 512 + 4096 * 11;
    1344             :   case AMDGPUAS::CONSTANT_BUFFER_12:
    1345             :     return 512 + 4096 * 12;
    1346             :   case AMDGPUAS::CONSTANT_BUFFER_13:
    1347             :     return 512 + 4096 * 13;
    1348             :   case AMDGPUAS::CONSTANT_BUFFER_14:
    1349             :     return 512 + 4096 * 14;
    1350             :   case AMDGPUAS::CONSTANT_BUFFER_15:
    1351             :     return 512 + 4096 * 15;
    1352             :   default:
    1353             :     return -1;
    1354             :   }
    1355             : }
    1356             : 
    1357        4024 : SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
    1358             :                                                 SelectionDAG &DAG) const {
    1359             :   SDLoc DL(Op);
    1360             :   LoadSDNode *Load = cast<LoadSDNode>(Op);
    1361             :   ISD::LoadExtType ExtType = Load->getExtensionType();
    1362        4024 :   EVT MemVT = Load->getMemoryVT();
    1363             :   assert(Load->getAlignment() >= MemVT.getStoreSize());
    1364             : 
    1365        4024 :   SDValue BasePtr = Load->getBasePtr();
    1366        4024 :   SDValue Chain = Load->getChain();
    1367        4024 :   SDValue Offset = Load->getOffset();
    1368             : 
    1369        4024 :   SDValue LoadPtr = BasePtr;
    1370        4024 :   if (!Offset.isUndef()) {
    1371           0 :     LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
    1372             :   }
    1373             : 
    1374             :   // Get dword location
    1375             :   // NOTE: this should be eliminated by the future SHR ptr, 2
    1376             :   SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
    1377        8048 :                             DAG.getConstant(0xfffffffc, DL, MVT::i32));
    1378             : 
    1379             :   // Load dword
    1380             :   // TODO: can we be smarter about machine pointer info?
    1381        4024 :   MachinePointerInfo PtrInfo(UndefValue::get(
    1382        4024 :       Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
    1383        4024 :   SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
    1384             : 
    1385             :   // Get offset within the register.
    1386             :   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
    1387        8048 :                                 LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
    1388             : 
    1389             :   // Bit offset of target byte (byteIdx * 8).
    1390             :   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
    1391        8048 :                                  DAG.getConstant(3, DL, MVT::i32));
    1392             : 
    1393             :   // Shift to the right.
    1394        4024 :   SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
    1395             : 
    1396             :   // Eliminate the upper bits by setting them to ...
    1397        4024 :   EVT MemEltVT = MemVT.getScalarType();
    1398             : 
    1399        4024 :   if (ExtType == ISD::SEXTLOAD) { // ... ones.
    1400        1280 :     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    1401        1280 :     Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
    1402             :   } else { // ... or zeros.
    1403        2744 :     Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
    1404             :   }
    1405             : 
    1406             :   SDValue Ops[] = {
    1407             :     Ret,
    1408             :     Read.getValue(1) // This should be our output chain
    1409        4024 :   };
    1410             : 
    1411        8048 :   return DAG.getMergeValues(Ops, DL);
    1412             : }
    1413             : 
    1414       38512 : SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    1415             :   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
    1416             :   unsigned AS = LoadNode->getAddressSpace();
    1417       38512 :   EVT MemVT = LoadNode->getMemoryVT();
    1418             :   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
    1419             : 
    1420       63807 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
    1421       77024 :       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    1422        4024 :     return lowerPrivateExtLoad(Op, DAG);
    1423             :   }
    1424             : 
    1425             :   SDLoc DL(Op);
    1426       34488 :   EVT VT = Op.getValueType();
    1427       34488 :   SDValue Chain = LoadNode->getChain();
    1428       34488 :   SDValue Ptr = LoadNode->getBasePtr();
    1429             : 
    1430       30131 :   if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
    1431       90247 :       LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
    1432             :       VT.isVector()) {
    1433         361 :       return scalarizeVectorLoad(LoadNode, DAG);
    1434             :   }
    1435             : 
    1436             :   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
    1437       12022 :   if (ConstantBlock > -1 &&
    1438         962 :       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
    1439             :        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    1440             :     SDValue Result;
    1441        9860 :     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
    1442        5056 :         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
    1443             :         isa<ConstantSDNode>(Ptr)) {
    1444        5056 :       SDValue Slots[4];
    1445       45504 :       for (unsigned i = 0; i < 4; i++) {
    1446             :         // We want Const position encoded with the following formula :
    1447             :         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    1448             :         // const_index is Ptr computed by llvm using an alignment of 16.
    1449             :         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
    1450             :         // then div by 4 at the ISel step
    1451             :         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
    1452       40448 :             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
    1453       20224 :         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
    1454             :       }
    1455        5056 :       EVT NewVT = MVT::v4i32;
    1456             :       unsigned NumElements = 4;
    1457        5056 :       if (VT.isVector()) {
    1458         541 :         NewVT = VT;
    1459         541 :         NumElements = VT.getVectorNumElements();
    1460             :       }
    1461        5056 :       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
    1462             :     } else {
    1463             :       // non-constant ptr can't be folded, keeps it as a v4f32 load
    1464           0 :       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
    1465             :           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
    1466             :                       DAG.getConstant(4, DL, MVT::i32)),
    1467           0 :                       DAG.getConstant(LoadNode->getAddressSpace() -
    1468             :                                       AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
    1469           0 :           );
    1470             :     }
    1471             : 
    1472        5056 :     if (!VT.isVector()) {
    1473        4515 :       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
    1474        9030 :                            DAG.getConstant(0, DL, MVT::i32));
    1475             :     }
    1476             : 
    1477             :     SDValue MergedValues[2] = {
    1478             :       Result,
    1479             :       Chain
    1480        5056 :     };
    1481        5056 :     return DAG.getMergeValues(MergedValues, DL);
    1482             :   }
    1483             : 
    1484             :   // For most operations returning SDValue() will result in the node being
    1485             :   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
    1486             :   // need to manually expand loads that may be legal in some address spaces and
    1487             :   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
    1488             :   // compute shaders, since the data is sign extended when it is uploaded to the
    1489             :   // buffer. However SEXT loads from other address spaces are not supported, so
    1490             :   // we need to expand them here.
    1491       29071 :   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    1492         290 :     EVT MemVT = LoadNode->getMemoryVT();
    1493             :     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    1494             :     SDValue NewLoad = DAG.getExtLoad(
    1495         290 :         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
    1496         580 :         LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    1497             :     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
    1498         290 :                               DAG.getValueType(MemVT));
    1499             : 
    1500         290 :     SDValue MergedValues[2] = { Res, Chain };
    1501         290 :     return DAG.getMergeValues(MergedValues, DL);
    1502             :   }
    1503             : 
    1504       28781 :   if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
    1505        7688 :     return SDValue();
    1506             :   }
    1507             : 
    1508             :   // DWORDADDR ISD marks already shifted address
    1509       21093 :   if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    1510             :     assert(VT == MVT::i32);
    1511       11644 :     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
    1512        5822 :     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
    1513       11644 :     return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
    1514             :   }
    1515       15271 :   return SDValue();
    1516             : }
    1517             : 
    1518          84 : SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    1519          84 :   SDValue Chain = Op.getOperand(0);
    1520          84 :   SDValue Cond  = Op.getOperand(1);
    1521          84 :   SDValue Jump  = Op.getOperand(2);
    1522             : 
    1523          84 :   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
    1524         168 :                      Chain, Jump, Cond);
    1525             : }
    1526             : 
    1527        1590 : SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
    1528             :                                             SelectionDAG &DAG) const {
    1529        1590 :   MachineFunction &MF = DAG.getMachineFunction();
    1530        1590 :   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
    1531             : 
    1532             :   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
    1533             : 
    1534        1590 :   unsigned FrameIndex = FIN->getIndex();
    1535             :   unsigned IgnoredFrameReg;
    1536             :   unsigned Offset =
    1537        1590 :     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
    1538        3180 :   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
    1539        3180 :                          Op.getValueType());
    1540             : }
    1541             : 
    1542             : /// XXX Only kernel functions are supported, so we can assume for now that
    1543             : /// every function is a kernel function, but in the future we should use
    1544             : /// separate calling conventions for kernel and non-kernel functions.
    1545        2189 : SDValue R600TargetLowering::LowerFormalArguments(
    1546             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1547             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1548             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1549             :   SmallVector<CCValAssign, 16> ArgLocs;
    1550             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1551        4378 :                  *DAG.getContext());
    1552        2189 :   MachineFunction &MF = DAG.getMachineFunction();
    1553        2189 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    1554             : 
    1555             :   SmallVector<ISD::InputArg, 8> LocalIns;
    1556             : 
    1557        2189 :   if (AMDGPU::isShader(CallConv)) {
    1558          47 :     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
    1559             :   } else {
    1560        2142 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1561             :   }
    1562             : 
    1563        7733 :   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    1564        5544 :     CCValAssign &VA = ArgLocs[i];
    1565             :     const ISD::InputArg &In = Ins[i];
    1566             :     EVT VT = In.VT;
    1567             :     EVT MemVT = VA.getLocVT();
    1568       10726 :     if (!VT.isVector() && MemVT.isVector()) {
    1569             :       // Get load source type if scalarized.
    1570           0 :       MemVT = MemVT.getVectorElementType();
    1571             :     }
    1572             : 
    1573        5544 :     if (AMDGPU::isShader(CallConv)) {
    1574          63 :       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
    1575          63 :       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1576          63 :       InVals.push_back(Register);
    1577          63 :       continue;
    1578             :     }
    1579             : 
    1580        5481 :     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
    1581        5481 :                                           AMDGPUASI.CONSTANT_BUFFER_0);
    1582             : 
    1583             :     // i64 isn't a legal type, so the register type used ends up as i32, which
    1584             :     // isn't expected here. It attempts to create this sextload, but it ends up
    1585             :     // being invalid. Somehow this seems to work with i64 arguments, but breaks
    1586             :     // for <1 x i64>.
    1587             : 
    1588             :     // The first 36 bytes of the input buffer contain information about
    1589             :     // thread group and global sizes.
    1590             :     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    1591        5481 :     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
    1592             :       // FIXME: This should really check the extload type, but the handling of
    1593             :       // extload vector parameters seems to be broken.
    1594             : 
    1595             :       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
    1596             :       Ext = ISD::SEXTLOAD;
    1597             :     }
    1598             : 
    1599             :     // Compute the offset from the value.
    1600             :     // XXX - I think PartOffset should give you this, but it seems to give the
    1601             :     // size of the register which isn't useful.
    1602             : 
    1603       10962 :     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    1604        5481 :     unsigned PartOffset = VA.getLocMemOffset();
    1605       10962 :     unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
    1606             : 
    1607        5481 :     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    1608             :     SDValue Arg = DAG.getLoad(
    1609             :         ISD::UNINDEXED, Ext, VT, DL, Chain,
    1610             :         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
    1611             :         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
    1612             :                                         MachineMemOperand::MODereferenceable |
    1613       10962 :                                         MachineMemOperand::MOInvariant);
    1614             : 
    1615             :     // 4 is the preferred alignment for the CONSTANT memory space.
    1616        5481 :     InVals.push_back(Arg);
    1617        5481 :     MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
    1618             :   }
    1619        4378 :   return Chain;
    1620             : }
    1621             : 
    1622       34812 : EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    1623             :                                            EVT VT) const {
    1624       34812 :    if (!VT.isVector())
    1625       34774 :      return MVT::i32;
    1626          38 :    return VT.changeVectorElementTypeToInteger();
    1627             : }
    1628             : 
    1629         109 : bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
    1630             :                                           const SelectionDAG &DAG) const {
    1631             :   // Local and Private addresses do not handle vectors. Limit to i32
    1632         109 :   if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
    1633         108 :     return (MemVT.getSizeInBits() <= 32);
    1634             :   }
    1635             :   return true;
    1636             : }
    1637             : 
    1638         837 : bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    1639             :                                                         unsigned AddrSpace,
    1640             :                                                         unsigned Align,
    1641             :                                                         bool *IsFast) const {
    1642         837 :   if (IsFast)
    1643         602 :     *IsFast = false;
    1644             : 
    1645         837 :   if (!VT.isSimple() || VT == MVT::Other)
    1646             :     return false;
    1647             : 
    1648         836 :   if (VT.bitsLT(MVT::i32))
    1649             :     return false;
    1650             : 
    1651             :   // TODO: This is a rough estimate.
    1652         801 :   if (IsFast)
    1653         590 :     *IsFast = true;
    1654             : 
    1655         801 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
    1656             : }
    1657             : 
    1658         392 : static SDValue CompactSwizzlableVector(
    1659             :   SelectionDAG &DAG, SDValue VectorEntry,
    1660             :   DenseMap<unsigned, unsigned> &RemapSwizzle) {
    1661             :   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
    1662             :   assert(RemapSwizzle.empty());
    1663             :   SDValue NewBldVec[4] = {
    1664             :     VectorEntry.getOperand(0),
    1665             :     VectorEntry.getOperand(1),
    1666             :     VectorEntry.getOperand(2),
    1667             :     VectorEntry.getOperand(3)
    1668         392 :   };
    1669             : 
    1670        1960 :   for (unsigned i = 0; i < 4; i++) {
    1671        3136 :     if (NewBldVec[i].isUndef())
    1672             :       // We mask write here to teach later passes that the ith element of this
    1673             :       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
    1674             :       // break false dependencies and additionnaly make assembly easier to read.
    1675         212 :       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    1676        1568 :     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
    1677          56 :       if (C->isZero()) {
    1678          27 :         RemapSwizzle[i] = 4; // SEL_0
    1679          27 :         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
    1680           1 :       } else if (C->isExactlyValue(1.0)) {
    1681           1 :         RemapSwizzle[i] = 5; // SEL_1
    1682           1 :         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
    1683             :       }
    1684             :     }
    1685             : 
    1686        3136 :     if (NewBldVec[i].isUndef())
    1687             :       continue;
    1688        4974 :     for (unsigned j = 0; j < i; j++) {
    1689             :       if (NewBldVec[i] == NewBldVec[j]) {
    1690          32 :         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
    1691          16 :         RemapSwizzle[i] = j;
    1692             :         break;
    1693             :       }
    1694             :     }
    1695             :   }
    1696             : 
    1697         392 :   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
    1698         784 :                             NewBldVec);
    1699             : }
    1700             : 
    1701         392 : static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
    1702             :                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
    1703             :   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
    1704             :   assert(RemapSwizzle.empty());
    1705             :   SDValue NewBldVec[4] = {
    1706             :       VectorEntry.getOperand(0),
    1707             :       VectorEntry.getOperand(1),
    1708             :       VectorEntry.getOperand(2),
    1709             :       VectorEntry.getOperand(3)
    1710         392 :   };
    1711         392 :   bool isUnmovable[4] = { false, false, false, false };
    1712        1960 :   for (unsigned i = 0; i < 4; i++) {
    1713        1568 :     RemapSwizzle[i] = i;
    1714        3136 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1715             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1716          94 :           ->getZExtValue();
    1717          47 :       if (i == Idx)
    1718          39 :         isUnmovable[Idx] = true;
    1719             :     }
    1720             :   }
    1721             : 
    1722        1949 :   for (unsigned i = 0; i < 4; i++) {
    1723        3126 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1724             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1725          92 :           ->getZExtValue();
    1726          46 :       if (isUnmovable[Idx])
    1727          40 :         continue;
    1728             :       // Swap i and Idx
    1729           6 :       std::swap(NewBldVec[Idx], NewBldVec[i]);
    1730             :       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
    1731           6 :       break;
    1732             :     }
    1733             :   }
    1734             : 
    1735         392 :   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
    1736         784 :                             NewBldVec);
    1737             : }
    1738             : 
    1739         392 : SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
    1740             :                                             SelectionDAG &DAG,
    1741             :                                             const SDLoc &DL) const {
    1742             :   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
    1743             :   // Old -> New swizzle values
    1744             :   DenseMap<unsigned, unsigned> SwizzleRemap;
    1745             : 
    1746         392 :   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
    1747        3528 :   for (unsigned i = 0; i < 4; i++) {
    1748        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1749        1568 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1750         119 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1751             :   }
    1752             : 
    1753         392 :   SwizzleRemap.clear();
    1754         392 :   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
    1755        3528 :   for (unsigned i = 0; i < 4; i++) {
    1756        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1757        1568 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1758        1350 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1759             :   }
    1760             : 
    1761         784 :   return BuildVector;
    1762             : }
    1763             : 
    1764             : //===----------------------------------------------------------------------===//
    1765             : // Custom DAG Optimizations
    1766             : //===----------------------------------------------------------------------===//
    1767             : 
    1768      198253 : SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
    1769             :                                               DAGCombinerInfo &DCI) const {
    1770      198253 :   SelectionDAG &DAG = DCI.DAG;
    1771             :   SDLoc DL(N);
    1772             : 
    1773      396506 :   switch (N->getOpcode()) {
    1774             :   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
    1775           2 :   case ISD::FP_ROUND: {
    1776           2 :       SDValue Arg = N->getOperand(0);
    1777           2 :       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
    1778             :         return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
    1779           2 :                            Arg.getOperand(0));
    1780             :       }
    1781             :       break;
    1782             :     }
    1783             : 
    1784             :   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
    1785             :   // (i32 select_cc f32, f32, -1, 0 cc)
    1786             :   //
    1787             :   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
    1788             :   // this to one of the SET*_DX10 instructions.
    1789          58 :   case ISD::FP_TO_SINT: {
    1790          58 :     SDValue FNeg = N->getOperand(0);
    1791          58 :     if (FNeg.getOpcode() != ISD::FNEG) {
    1792          42 :       return SDValue();
    1793             :     }
    1794          16 :     SDValue SelectCC = FNeg.getOperand(0);
    1795             :     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
    1796           8 :         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
    1797          16 :         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
    1798          32 :         !isHWTrueValue(SelectCC.getOperand(2)) ||
    1799           8 :         !isHWFalseValue(SelectCC.getOperand(3))) {
    1800           8 :       return SDValue();
    1801             :     }
    1802             : 
    1803             :     return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
    1804             :                            SelectCC.getOperand(0), // LHS
    1805             :                            SelectCC.getOperand(1), // RHS
    1806             :                            DAG.getConstant(-1, DL, MVT::i32), // True
    1807             :                            DAG.getConstant(0, DL, MVT::i32),  // False
    1808          32 :                            SelectCC.getOperand(4)); // CC
    1809             : 
    1810             :     break;
    1811             :   }
    1812             : 
    1813             :   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
    1814             :   // => build_vector elt0, ... , NewEltIdx, ... , eltN
    1815         276 :   case ISD::INSERT_VECTOR_ELT: {
    1816         276 :     SDValue InVec = N->getOperand(0);
    1817         276 :     SDValue InVal = N->getOperand(1);
    1818         276 :     SDValue EltNo = N->getOperand(2);
    1819             : 
    1820             :     // If the inserted element is an UNDEF, just use the input vector.
    1821         276 :     if (InVal.isUndef())
    1822           0 :       return InVec;
    1823             : 
    1824         276 :     EVT VT = InVec.getValueType();
    1825             : 
    1826             :     // If we can't generate a legal BUILD_VECTOR, exit
    1827             :     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
    1828           8 :       return SDValue();
    1829             : 
    1830             :     // Check that we know which element is being inserted
    1831             :     if (!isa<ConstantSDNode>(EltNo))
    1832           2 :       return SDValue();
    1833         266 :     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
    1834             : 
    1835             :     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    1836             :     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
    1837             :     // vector elements.
    1838             :     SmallVector<SDValue, 8> Ops;
    1839         266 :     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
    1840           8 :       Ops.append(InVec.getNode()->op_begin(),
    1841             :                  InVec.getNode()->op_end());
    1842         262 :     } else if (InVec.isUndef()) {
    1843           0 :       unsigned NElts = VT.getVectorNumElements();
    1844           0 :       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    1845             :     } else {
    1846         262 :       return SDValue();
    1847             :     }
    1848             : 
    1849             :     // Insert the element
    1850           4 :     if (Elt < Ops.size()) {
    1851             :       // All the operands of BUILD_VECTOR must have the same type;
    1852             :       // we enforce that here.
    1853           8 :       EVT OpVT = Ops[0].getValueType();
    1854           0 :       if (InVal.getValueType() != OpVT)
    1855           0 :         InVal = OpVT.bitsGT(InVal.getValueType()) ?
    1856           0 :           DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
    1857           0 :           DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
    1858           4 :       Ops[Elt] = InVal;
    1859             :     }
    1860             : 
    1861             :     // Return the new vector
    1862           4 :     return DAG.getBuildVector(VT, DL, Ops);
    1863             :   }
    1864             : 
    1865             :   // Extract_vec (Build_vector) generated by custom lowering
    1866             :   // also needs to be customly combined
    1867       11572 :   case ISD::EXTRACT_VECTOR_ELT: {
    1868       11572 :     SDValue Arg = N->getOperand(0);
    1869       11572 :     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
    1870             :       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    1871           0 :         unsigned Element = Const->getZExtValue();
    1872           0 :         return Arg->getOperand(Element);
    1873             :       }
    1874             :     }
    1875         202 :     if (Arg.getOpcode() == ISD::BITCAST &&
    1876       11804 :         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
    1877       11632 :         (Arg.getOperand(0).getValueType().getVectorNumElements() ==
    1878       11602 :          Arg.getValueType().getVectorNumElements())) {
    1879             :       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    1880          29 :         unsigned Element = Const->getZExtValue();
    1881             :         return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
    1882          58 :                            Arg->getOperand(0).getOperand(Element));
    1883             :       }
    1884             :     }
    1885             :     break;
    1886             :   }
    1887             : 
    1888       11332 :   case ISD::SELECT_CC: {
    1889             :     // Try common optimizations
    1890       11332 :     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
    1891           0 :       return Ret;
    1892             : 
    1893             :     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    1894             :     //      selectcc x, y, a, b, inv(cc)
    1895             :     //
    1896             :     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    1897             :     //      selectcc x, y, a, b, cc
    1898       11332 :     SDValue LHS = N->getOperand(0);
    1899       11332 :     if (LHS.getOpcode() != ISD::SELECT_CC) {
    1900        5587 :       return SDValue();
    1901             :     }
    1902             : 
    1903        5745 :     SDValue RHS = N->getOperand(1);
    1904        5745 :     SDValue True = N->getOperand(2);
    1905        5745 :     SDValue False = N->getOperand(3);
    1906        5745 :     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
    1907             : 
    1908        5771 :     if (LHS.getOperand(2).getNode() != True.getNode() ||
    1909        5757 :         LHS.getOperand(3).getNode() != False.getNode() ||
    1910             :         RHS.getNode() != False.getNode()) {
    1911        5741 :       return SDValue();
    1912             :     }
    1913             : 
    1914           4 :     switch (NCC) {
    1915           0 :     default: return SDValue();
    1916           1 :     case ISD::SETNE: return LHS;
    1917           3 :     case ISD::SETEQ: {
    1918           3 :       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
    1919           3 :       LHSCC = ISD::getSetCCInverse(LHSCC,
    1920           9 :                                   LHS.getOperand(0).getValueType().isInteger());
    1921           6 :       if (DCI.isBeforeLegalizeOps() ||
    1922             :           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
    1923             :         return DAG.getSelectCC(DL,
    1924             :                                LHS.getOperand(0),
    1925             :                                LHS.getOperand(1),
    1926             :                                LHS.getOperand(2),
    1927             :                                LHS.getOperand(3),
    1928           0 :                                LHSCC);
    1929             :       break;
    1930             :     }
    1931             :     }
    1932           3 :     return SDValue();
    1933             :   }
    1934             : 
    1935         138 :   case AMDGPUISD::R600_EXPORT: {
    1936         138 :     SDValue Arg = N->getOperand(1);
    1937         138 :     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
    1938             :       break;
    1939             : 
    1940             :     SDValue NewArgs[8] = {
    1941             :       N->getOperand(0), // Chain
    1942             :       SDValue(),
    1943             :       N->getOperand(2), // ArrayBase
    1944             :       N->getOperand(3), // Type
    1945             :       N->getOperand(4), // SWZ_X
    1946             :       N->getOperand(5), // SWZ_Y
    1947             :       N->getOperand(6), // SWZ_Z
    1948             :       N->getOperand(7) // SWZ_W
    1949         120 :     };
    1950         120 :     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    1951         240 :     return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
    1952             :   }
    1953         296 :   case AMDGPUISD::TEXTURE_FETCH: {
    1954         296 :     SDValue Arg = N->getOperand(1);
    1955         296 :     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
    1956             :       break;
    1957             : 
    1958             :     SDValue NewArgs[19] = {
    1959             :       N->getOperand(0),
    1960             :       N->getOperand(1),
    1961             :       N->getOperand(2),
    1962             :       N->getOperand(3),
    1963             :       N->getOperand(4),
    1964             :       N->getOperand(5),
    1965             :       N->getOperand(6),
    1966             :       N->getOperand(7),
    1967             :       N->getOperand(8),
    1968             :       N->getOperand(9),
    1969             :       N->getOperand(10),
    1970             :       N->getOperand(11),
    1971             :       N->getOperand(12),
    1972             :       N->getOperand(13),
    1973             :       N->getOperand(14),
    1974             :       N->getOperand(15),
    1975             :       N->getOperand(16),
    1976             :       N->getOperand(17),
    1977             :       N->getOperand(18),
    1978         272 :     };
    1979         272 :     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    1980         544 :     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
    1981             :   }
    1982             :   default: break;
    1983             :   }
    1984             : 
    1985      186165 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    1986             : }
    1987             : 
    1988      238191 : bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
    1989             :                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
    1990             :                                      SDValue &Sel, SDValue &Imm,
    1991             :                                      SelectionDAG &DAG) const {
    1992      238191 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    1993      476382 :   if (!Src.isMachineOpcode())
    1994             :     return false;
    1995             : 
    1996      153070 :   switch (Src.getMachineOpcode()) {
    1997         117 :   case AMDGPU::FNEG_R600:
    1998         117 :     if (!Neg.getNode())
    1999             :       return false;
    2000          93 :     Src = Src.getOperand(0);
    2001         186 :     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    2002          93 :     return true;
    2003         109 :   case AMDGPU::FABS_R600:
    2004         109 :     if (!Abs.getNode())
    2005             :       return false;
    2006          93 :     Src = Src.getOperand(0);
    2007         186 :     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    2008          93 :     return true;
    2009       10497 :   case AMDGPU::CONST_COPY: {
    2010       10497 :     unsigned Opcode = ParentNode->getMachineOpcode();
    2011       10497 :     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2012             : 
    2013       10497 :     if (!Sel.getNode())
    2014             :       return false;
    2015             : 
    2016       18304 :     SDValue CstOffset = Src.getOperand(0);
    2017       27456 :     if (ParentNode->getValueType(0).isVector())
    2018             :       return false;
    2019             : 
    2020             :     // Gather constants values
    2021             :     int SrcIndices[] = {
    2022        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
    2023        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
    2024        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
    2025        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
    2026        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
    2027        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
    2028        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
    2029        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
    2030        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
    2031        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
    2032        9152 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    2033      100672 :     };
    2034             :     std::vector<unsigned> Consts;
    2035      210496 :     for (int OtherSrcIdx : SrcIndices) {
    2036      100672 :       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
    2037      100672 :       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
    2038       82136 :         continue;
    2039       18536 :       if (HasDst) {
    2040       18536 :         OtherSrcIdx--;
    2041       18536 :         OtherSelIdx--;
    2042             :       }
    2043             :       if (RegisterSDNode *Reg =
    2044       18536 :           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
    2045         725 :         if (Reg->getReg() == AMDGPU::ALU_CONST) {
    2046             :           ConstantSDNode *Cst
    2047         557 :             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
    2048        1671 :           Consts.push_back(Cst->getZExtValue());
    2049             :         }
    2050             :       }
    2051             :     }
    2052             : 
    2053             :     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    2054       27456 :     Consts.push_back(Cst->getZExtValue());
    2055        9152 :     if (!TII->fitsConstReadLimitations(Consts)) {
    2056             :       return false;
    2057             :     }
    2058             : 
    2059        9132 :     Sel = CstOffset;
    2060        9132 :     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    2061        9132 :     return true;
    2062             :   }
    2063             :   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
    2064             :     // Check if the Imm slot is used. Taken from below.
    2065          28 :     if (cast<ConstantSDNode>(Imm)->getZExtValue())
    2066             :       return false;
    2067          14 :     Imm = Src.getOperand(0);
    2068          14 :     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
    2069          14 :     return true;
    2070       27121 :   case AMDGPU::MOV_IMM_I32:
    2071             :   case AMDGPU::MOV_IMM_F32: {
    2072             :     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    2073             :     uint64_t ImmValue = 0;
    2074             : 
    2075       27121 :     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
    2076             :       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
    2077         992 :       float FloatValue = FPC->getValueAPF().convertToFloat();
    2078         496 :       if (FloatValue == 0.0) {
    2079             :         ImmReg = AMDGPU::ZERO;
    2080         354 :       } else if (FloatValue == 0.5) {
    2081             :         ImmReg = AMDGPU::HALF;
    2082         319 :       } else if (FloatValue == 1.0) {
    2083             :         ImmReg = AMDGPU::ONE;
    2084             :       } else {
    2085         750 :         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
    2086             :       }
    2087             :     } else {
    2088             :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
    2089       26625 :       uint64_t Value = C->getZExtValue();
    2090       26625 :       if (Value == 0) {
    2091             :         ImmReg = AMDGPU::ZERO;
    2092       24972 :       } else if (Value == 1) {
    2093             :         ImmReg = AMDGPU::ONE_INT;
    2094             :       } else {
    2095             :         ImmValue = Value;
    2096             :       }
    2097             :     }
    2098             : 
    2099             :     // Check that we aren't already using an immediate.
    2100             :     // XXX: It's possible for an instruction to have more than one
    2101             :     // immediate operand, but this is not supported yet.
    2102             :     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
    2103       22348 :       if (!Imm.getNode())
    2104             :         return false;
    2105             :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
    2106             :       assert(C);
    2107       44588 :       if (C->getZExtValue())
    2108             :         return false;
    2109       40100 :       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    2110             :     }
    2111       24823 :     Src = DAG.getRegister(ImmReg, MVT::i32);
    2112       24823 :     return true;
    2113             :   }
    2114             :   default:
    2115             :     return false;
    2116             :   }
    2117             : }
    2118             : 
    2119             : /// \brief Fold the instructions after selecting them
                       ///
                       /// Copies \p Node's operands into a scratch vector and hands selected source
                       /// operands (together with their neg/abs/sel/literal companions where the
                       /// opcode has them) to FoldOperand(). As soon as one FoldOperand() call
                       /// returns true, a new machine node with the same opcode and value types is
                       /// built from the scratch vector and returned.
                       ///
                       /// Four cases are distinguished by opcode:
                       ///  - DOT_4: the eight per-channel sources src0_X .. src1_W.
                       ///  - REG_SEQUENCE: every second operand, starting at operand 1.
                       ///  - CLAMP_R600: re-emits the node producing operand 0 with its clamp
                       ///    operand set to 1, provided that producer has a clamp operand.
                       ///  - Any other opcode for which hasInstrModifiers() is true: src0 .. src2.
                       ///
                       /// \param Node the already-selected machine node to inspect.
                       /// \param DAG  the DAG used to create replacement nodes and constants.
                       /// \returns \p Node itself when nothing was changed, otherwise the node
                       ///          created by DAG.getMachineNode().
    2120      160148 : SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
    2121             :                                             SelectionDAG &DAG) const {
    2122      160148 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    2123      160148 :   if (!Node->isMachineOpcode())
    2124             :     return Node;
    2125             : 
    2126             :   unsigned Opcode = Node->getMachineOpcode();
                         // Default-constructed placeholder, bound by reference wherever an
                         // operand (sel, literal, neg, abs) has no real slot to pass to
                         // FoldOperand().
    2127      160148 :   SDValue FakeOp;
    2128             : 
                         // Scratch copy of the operand list. FoldOperand() receives references
                         // into it, and the replacement node is built from it -- presumably
                         // FoldOperand() rewrites the entries in place when it returns true.
    2129      160148 :   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
    2130             : 
    2131      160148 :   if (Opcode == AMDGPU::DOT_4) {
                           // Three parallel tables: entry i of NegIdx/AbsIdx is the modifier
                           // belonging to source operand i of OperandIdx.
    2132             :     int OperandIdx[] = {
    2133         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
    2134         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
    2135         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
    2136         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
    2137         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
    2138         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
    2139         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
    2140         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    2141        1176 :         };
    2142             :     int NegIdx[] = {
    2143         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
    2144         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
    2145         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
    2146         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
    2147         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
    2148         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
    2149         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
    2150         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    2151        1176 :     };
    2152             :     int AbsIdx[] = {
    2153         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
    2154         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
    2155         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
    2156         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
    2157         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
    2158         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
    2159         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
    2160         147 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    2161        1176 :     };
    2162        1975 :     for (unsigned i = 0; i < 8; i++) {
                             // A negative index stops the whole fold, not just this operand.
    2163         994 :       if (OperandIdx[i] < 0)
    2164          80 :         return Node;
                             // The "- 1" is applied unconditionally here, whereas SelIdx below is
                             // only decremented when the opcode has a dst operand -- presumably
                             // both compensate for dst being absent from the node's operand list;
                             // TODO confirm.
                             // NOTE(review): NegIdx[i] and AbsIdx[i] are not checked for < 0
                             // before indexing Ops.
    2165         994 :       SDValue &Src = Ops[OperandIdx[i] - 1];
    2166         994 :       SDValue &Neg = Ops[NegIdx[i] - 1];
    2167         994 :       SDValue &Abs = Ops[AbsIdx[i] - 1];
                             // HasDst depends only on Opcode, so it has the same value on every
                             // iteration.
    2168         994 :       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2169         994 :       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
    2170         994 :       if (HasDst)
    2171         994 :         SelIdx--;
                             // Fall back to the placeholder when there is no sel operand; the
                             // literal slot is always the placeholder in this branch.
    2172         994 :       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
    2173         994 :       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
    2174         240 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2175             :     }
    2176      160001 :   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
                           // Visits operands 1, 3, 5, ... only -- presumably the value operands,
                           // with the even positions holding the register class / sub-register
                           // indices; verify against the REG_SEQUENCE operand layout. No
                           // modifier, sel or literal slots are offered to FoldOperand().
    2177       49456 :     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
    2178       20166 :       SDValue &Src = Ops[i];
    2179       20166 :       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
    2180        3819 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2181             :     }
    2182      154166 :   } else if (Opcode == AMDGPU::CLAMP_R600) {
                           // This coverage report records zero hits for every executable line of
                           // this branch.
    2183           0 :     SDValue Src = Node->getOperand(0);
                           // Only a machine node that supports instruction modifiers can absorb
                           // the clamp.
    2184           0 :     if (!Src.isMachineOpcode() ||
    2185           0 :         !TII->hasInstrModifiers(Src.getMachineOpcode()))
    2186             :       return Node;
    2187             :     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
    2188           0 :         AMDGPU::OpName::clamp);
    2189           0 :     if (ClampIdx < 0)
    2190             :       return Node;
    2191             :     SDLoc DL(Node);
                           // This local Ops shadows the function-level Ops and holds the
                           // operands of Src (the producer), not of Node.
    2192           0 :     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
                           // Set the producer's clamp operand to the i32 target constant 1, then
                           // re-create the producer using Node's value-type list.
    2193           0 :     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
    2194           0 :     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
    2195           0 :                               Node->getVTList(), Ops);
    2196             :   } else {
    2197      154166 :     if (!TII->hasInstrModifiers(Opcode))
    2198      139891 :       return Node;
                           // Parallel tables for up to three sources. There is no abs lookup for
                           // src2, so its AbsIdx entry is the sentinel -1.
    2199             :     int OperandIdx[] = {
    2200      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
    2201      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
    2202      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    2203      314694 :     };
    2204             :     int NegIdx[] = {
    2205      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
    2206      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
    2207      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    2208      314694 :     };
    2209      104898 :     int AbsIdx[] = {
    2210      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
    2211      104898 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
    2212             :       -1
    2213      209796 :     };
    2214      473356 :     for (unsigned i = 0; i < 3; i++) {
                             // The first missing source operand ends the fold attempt.
    2215      274852 :       if (OperandIdx[i] < 0)
    2216       90623 :         return Node;
                             // NOTE(review): NegIdx[i] is not checked for < 0 before indexing
                             // Ops -- presumably every source operand has a neg operand; confirm.
    2217      217031 :       SDValue &Src = Ops[OperandIdx[i] - 1];
    2218      217031 :       SDValue &Neg = Ops[NegIdx[i] - 1];
                             // Per-iteration placeholder bound to Abs when AbsIdx[i] is negative.
    2219      217031 :       SDValue FakeAbs;
    2220      217031 :       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
                             // HasDst and the literal lookup depend only on Opcode, so they yield
                             // the same value on every iteration.
    2221      217031 :       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2222      217031 :       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
    2223      217031 :       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
    2224      217031 :       if (HasDst) {
    2225      217031 :         SelIdx--;
    2226      217031 :         ImmIdx--;
    2227             :       }
    2228      217031 :       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
                             // NOTE(review): unlike Sel, ImmIdx is used without a "> -1" guard --
                             // presumably every opcode reaching this point has a literal operand;
                             // confirm.
    2229      217031 :       SDValue &Imm = Ops[ImmIdx];
    2230      217031 :       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
    2231       98406 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2232             :     }
    2233             :   }
    2234             : 
                         // Nothing was folded: hand back the original node.
    2235             :   return Node;
    2236             : }

Generated by: LCOV version 1.13