LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - R600ISelLowering.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 859 927 92.7 %
Date: 2018-06-17 00:07:59 Functions: 37 39 94.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Custom DAG lowering for R600
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "R600ISelLowering.h"
      16             : #include "AMDGPUFrameLowering.h"
      17             : #include "AMDGPUIntrinsicInfo.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "R600Defines.h"
      20             : #include "R600FrameLowering.h"
      21             : #include "R600InstrInfo.h"
      22             : #include "R600MachineFunctionInfo.h"
      23             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      24             : #include "Utils/AMDGPUBaseInfo.h"
      25             : #include "llvm/ADT/APFloat.h"
      26             : #include "llvm/ADT/APInt.h"
      27             : #include "llvm/ADT/ArrayRef.h"
      28             : #include "llvm/ADT/DenseMap.h"
      29             : #include "llvm/ADT/SmallVector.h"
      30             : #include "llvm/CodeGen/CallingConvLower.h"
      31             : #include "llvm/CodeGen/DAGCombine.h"
      32             : #include "llvm/CodeGen/ISDOpcodes.h"
      33             : #include "llvm/CodeGen/MachineBasicBlock.h"
      34             : #include "llvm/CodeGen/MachineFunction.h"
      35             : #include "llvm/CodeGen/MachineInstr.h"
      36             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      37             : #include "llvm/CodeGen/MachineMemOperand.h"
      38             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      39             : #include "llvm/CodeGen/SelectionDAG.h"
      40             : #include "llvm/IR/Constants.h"
      41             : #include "llvm/IR/DerivedTypes.h"
      42             : #include "llvm/Support/Casting.h"
      43             : #include "llvm/Support/Compiler.h"
      44             : #include "llvm/Support/ErrorHandling.h"
      45             : #include "llvm/Support/MachineValueType.h"
      46             : #include <cassert>
      47             : #include <cstdint>
      48             : #include <iterator>
      49             : #include <utility>
      50             : #include <vector>
      51             : 
      52             : using namespace llvm;
      53             : 
/// Construct the R600 DAG lowering: register the legal register classes for
/// scalar and short-vector f32/i32 types, record which DAG operations need
/// Custom lowering or Expansion on this subtarget, and register the node
/// kinds that PerformDAGCombine should be invoked on.
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
                                       const R600Subtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  // Scalars live in 32-bit registers; v2 and v4 vectors use the 64- and
  // 128-bit register classes respectively.
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  // We need to include these since trunc STORES to PRIVATE need
  // special handling to accommodate RMW
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);

  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);

  // Set condition code actions: condition codes the hardware cannot test
  // directly are expanded into combinations of supported ones.
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // ADD, SUB overflow.
  // TODO: turn these into Legal?
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  //  to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  if (!Subtarget->hasFMA()) {
    setOperationAction(ISD::FMA, MVT::f32, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
  }

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  // LLVM will expand these to atomic_cmp_swap(0)
  // and atomic_swap, respectively.
  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setSchedulingPreference(Sched::Source);

  // Node kinds routed to this target's PerformDAGCombine hook.
  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::LOAD);
}
     248             : 
/// Return the subtarget as its R600-specific subclass.
///
/// The base class stores the subtarget through a base-class pointer; the
/// static_cast downcast is safe because this lowering's constructor only
/// ever receives an R600Subtarget.
const R600Subtarget *R600TargetLowering::getSubtarget() const {
  return static_cast<const R600Subtarget *>(Subtarget);
}
     252             : 
     253        2440 : static inline bool isEOP(MachineBasicBlock::iterator I) {
     254        4880 :   if (std::next(I) == I->getParent()->end())
     255             :     return false;
     256        4864 :   return std::next(I)->getOpcode() == AMDGPU::RETURN;
     257             : }
     258             : 
/// Expand pseudo instructions that were marked usesCustomInserter into real
/// R600 machine instructions. Unless a case returns early, the original
/// pseudo \p MI is erased after the replacement has been emitted, and the
/// (possibly updated) block \p BB is returned.
MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instruction that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI.getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB; // Result is used (or unsupported op): keep the RET form.

      // Rebuild as the NORET form, copying all operands except dst.
      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
        NewMI.add(MI.getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;

  // FABS/FNEG lower to a plain MOV carrying the ABS/NEG source-modifier flag.
  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    // Tag the instruction that defines the masked register rather than
    // emitting anything new.
    unsigned maskedRegister = MI.getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    // Materialize the f32 immediate via its raw i32 bit pattern.
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
                                                            .getFPImm()
                                                            ->getValueAPF()
                                                            .bitcastToAPInt()
                                                            .getZExtValue());
    break;

  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getImm());
    break;

  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
    //TODO: Perhaps combine this instruction with the next if possible
    auto MIB = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
    //TODO: Ugh this is rather ugly
    MIB->getOperand(Idx) = MI.getOperand(1);
    break;
  }

  case AMDGPU::CONST_COPY: {
    // MOV from the constant file; the source constant is selected through
    // the src0_sel immediate operand.
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
                       MI.getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case AMDGPU::RAT_STORE_TYPED_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .add(MI.getOperand(0));
    break;

  // Conditional branches expand to a PRED_X that sets the predicate bit,
  // followed by a JUMP_COND consuming (and killing) that bit.
  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(AMDGPU::PRED_SETNE)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(AMDGPU::PRED_SETNE_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if its not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    // Scan the rest of the block for a later export of the same type.
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB; // Keep the pseudo unchanged; a later export finalizes it.
    // CF instruction encoding differs between Evergreen and R600 exports.
    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .add(MI.getOperand(3))
        .add(MI.getOperand(4))
        .add(MI.getOperand(5))
        .add(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    return BB;
  }
  }

  // Every case that did not return early replaced MI; remove the pseudo.
  MI.eraseFromParent();
  return BB;
}
     438             : 
     439             : //===----------------------------------------------------------------------===//
     440             : // Custom DAG Lowering Operations
     441             : //===----------------------------------------------------------------------===//
     442             : 
     443      103563 : SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     444      103563 :   MachineFunction &MF = DAG.getMachineFunction();
     445      103563 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
     446      103563 :   switch (Op.getOpcode()) {
     447         450 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     448       11046 :   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
     449           7 :   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
     450          50 :   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
     451          28 :   case ISD::SRA_PARTS:
     452          28 :   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
     453          64 :   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
     454         620 :   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
     455          17 :   case ISD::FCOS:
     456          17 :   case ISD::FSIN: return LowerTrig(Op, DAG);
     457       16360 :   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
     458       33297 :   case ISD::STORE: return LowerSTORE(Op, DAG);
     459       39125 :   case ISD::LOAD: {
     460       39125 :     SDValue Result = LowerLOAD(Op, DAG);
     461             :     assert((!Result.getNode() ||
     462             :             Result.getNode()->getNumValues() == 2) &&
     463             :            "Load should return a value and a chain");
     464       39125 :     return Result;
     465             :   }
     466             : 
     467          86 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
     468          57 :   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
     469        1606 :   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
     470          82 :   case ISD::INTRINSIC_VOID: {
     471          82 :     SDValue Chain = Op.getOperand(0);
     472             :     unsigned IntrinsicID =
     473         164 :                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     474          82 :     switch (IntrinsicID) {
     475          60 :     case Intrinsic::r600_store_swizzle: {
     476             :       SDLoc DL(Op);
     477             :       const SDValue Args[8] = {
     478             :         Chain,
     479             :         Op.getOperand(2), // Export Value
     480             :         Op.getOperand(3), // ArrayBase
     481             :         Op.getOperand(4), // Type
     482          60 :         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
     483          60 :         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
     484          60 :         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
     485          60 :         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
     486         360 :       };
     487          60 :       return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
     488             :     }
     489             : 
     490             :     // default for switch(IntrinsicID)
     491             :     default: break;
     492             :     }
     493             :     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
     494             :     break;
     495             :   }
     496         668 :   case ISD::INTRINSIC_WO_CHAIN: {
     497             :     unsigned IntrinsicID =
     498        1336 :                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     499         668 :     EVT VT = Op.getValueType();
     500             :     SDLoc DL(Op);
     501         668 :     switch (IntrinsicID) {
     502         276 :     case Intrinsic::r600_tex:
     503             :     case Intrinsic::r600_texc: {
     504             :       unsigned TextureOp;
     505         276 :       switch (IntrinsicID) {
     506             :       case Intrinsic::r600_tex:
     507             :         TextureOp = 0;
     508             :         break;
     509           7 :       case Intrinsic::r600_texc:
     510             :         TextureOp = 1;
     511           7 :         break;
     512           0 :       default:
     513           0 :         llvm_unreachable("unhandled texture operation");
     514             :       }
     515             : 
     516             :       SDValue TexArgs[19] = {
     517         552 :         DAG.getConstant(TextureOp, DL, MVT::i32),
     518             :         Op.getOperand(1),
     519         276 :         DAG.getConstant(0, DL, MVT::i32),
     520         276 :         DAG.getConstant(1, DL, MVT::i32),
     521         276 :         DAG.getConstant(2, DL, MVT::i32),
     522         276 :         DAG.getConstant(3, DL, MVT::i32),
     523             :         Op.getOperand(2),
     524             :         Op.getOperand(3),
     525             :         Op.getOperand(4),
     526         276 :         DAG.getConstant(0, DL, MVT::i32),
     527         276 :         DAG.getConstant(1, DL, MVT::i32),
     528         276 :         DAG.getConstant(2, DL, MVT::i32),
     529         276 :         DAG.getConstant(3, DL, MVT::i32),
     530             :         Op.getOperand(5),
     531             :         Op.getOperand(6),
     532             :         Op.getOperand(7),
     533             :         Op.getOperand(8),
     534             :         Op.getOperand(9),
     535             :         Op.getOperand(10)
     536        3312 :       };
     537         276 :       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
     538             :     }
     539          32 :     case Intrinsic::r600_dot4: {
     540             :       SDValue Args[8] = {
     541             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     542          64 :           DAG.getConstant(0, DL, MVT::i32)),
     543             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     544          64 :           DAG.getConstant(0, DL, MVT::i32)),
     545             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     546          64 :           DAG.getConstant(1, DL, MVT::i32)),
     547             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     548          64 :           DAG.getConstant(1, DL, MVT::i32)),
     549             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     550          64 :           DAG.getConstant(2, DL, MVT::i32)),
     551             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     552          64 :           DAG.getConstant(2, DL, MVT::i32)),
     553             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     554          64 :           DAG.getConstant(3, DL, MVT::i32)),
     555             :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     556          64 :           DAG.getConstant(3, DL, MVT::i32))
     557         256 :       };
     558          32 :       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
     559             :     }
     560             : 
     561           2 :     case Intrinsic::r600_implicitarg_ptr: {
     562           2 :       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
     563           2 :       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
     564           2 :       return DAG.getConstant(ByteOffset, DL, PtrVT);
     565             :     }
     566           1 :     case Intrinsic::r600_read_ngroups_x:
     567           1 :       return LowerImplicitParameter(DAG, VT, DL, 0);
     568           1 :     case Intrinsic::r600_read_ngroups_y:
     569           1 :       return LowerImplicitParameter(DAG, VT, DL, 1);
     570           1 :     case Intrinsic::r600_read_ngroups_z:
     571           1 :       return LowerImplicitParameter(DAG, VT, DL, 2);
     572           2 :     case Intrinsic::r600_read_global_size_x:
     573           2 :       return LowerImplicitParameter(DAG, VT, DL, 3);
     574           2 :     case Intrinsic::r600_read_global_size_y:
     575           2 :       return LowerImplicitParameter(DAG, VT, DL, 4);
     576           2 :     case Intrinsic::r600_read_global_size_z:
     577           2 :       return LowerImplicitParameter(DAG, VT, DL, 5);
     578           8 :     case Intrinsic::r600_read_local_size_x:
     579           8 :       return LowerImplicitParameter(DAG, VT, DL, 6);
     580          36 :     case Intrinsic::r600_read_local_size_y:
     581          36 :       return LowerImplicitParameter(DAG, VT, DL, 7);
     582          36 :     case Intrinsic::r600_read_local_size_z:
     583          36 :       return LowerImplicitParameter(DAG, VT, DL, 8);
     584             : 
     585           4 :     case Intrinsic::r600_read_tgid_x:
     586             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     587           4 :                                      AMDGPU::T1_X, VT);
     588           3 :     case Intrinsic::r600_read_tgid_y:
     589             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     590           3 :                                      AMDGPU::T1_Y, VT);
     591           3 :     case Intrinsic::r600_read_tgid_z:
     592             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     593           3 :                                      AMDGPU::T1_Z, VT);
     594         183 :     case Intrinsic::r600_read_tidig_x:
     595             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     596         183 :                                      AMDGPU::T0_X, VT);
     597          32 :     case Intrinsic::r600_read_tidig_y:
     598             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     599          32 :                                      AMDGPU::T0_Y, VT);
     600          32 :     case Intrinsic::r600_read_tidig_z:
     601             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     602          32 :                                      AMDGPU::T0_Z, VT);
     603             : 
     604             :     case Intrinsic::r600_recipsqrt_ieee:
     605           3 :       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
     606             : 
     607             :     case Intrinsic::r600_recipsqrt_clamped:
     608           5 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
     609           4 :     default:
     610           4 :       return Op;
     611             :     }
     612             : 
     613             :     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
     614             :     break;
     615             :   }
     616             :   } // end switch(Op.getOpcode())
     617          22 :   return SDValue();
     618             : }
     619             : 
     620         105 : void R600TargetLowering::ReplaceNodeResults(SDNode *N,
     621             :                                             SmallVectorImpl<SDValue> &Results,
     622             :                                             SelectionDAG &DAG) const {
     623         210 :   switch (N->getOpcode()) {
     624          53 :   default:
     625          53 :     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
     626          53 :     return;
     627             :   case ISD::FP_TO_UINT:
     628           9 :     if (N->getValueType(0) == MVT::i1) {
     629           4 :       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
     630           2 :       return;
     631             :     }
     632             :     // Since we don't care about out of bounds values we can use FP_TO_SINT for
     633             :     // uints too. The DAGLegalizer code for uint considers some extra cases
     634             :     // which are not necessary here.
     635             :     LLVM_FALLTHROUGH;
     636             :   case ISD::FP_TO_SINT: {
     637          16 :     if (N->getValueType(0) == MVT::i1) {
     638           4 :       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
     639           2 :       return;
     640             :     }
     641             : 
     642          14 :     SDValue Result;
     643          14 :     if (expandFP_TO_SINT(N, Result, DAG))
     644          14 :       Results.push_back(Result);
     645             :     return;
     646             :   }
     647             :   case ISD::SDIVREM: {
     648             :     SDValue Op = SDValue(N, 1);
     649          12 :     SDValue RES = LowerSDIVREM(Op, DAG);
     650          12 :     Results.push_back(RES);
     651          12 :     Results.push_back(RES.getValue(1));
     652             :     break;
     653             :   }
     654             :   case ISD::UDIVREM: {
     655             :     SDValue Op = SDValue(N, 0);
     656          22 :     LowerUDIVREM64(Op, DAG, Results);
     657             :     break;
     658             :   }
     659             :   }
     660             : }
     661             : 
     662          16 : SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
     663             :                                                    SDValue Vector) const {
     664             :   SDLoc DL(Vector);
     665          16 :   EVT VecVT = Vector.getValueType();
     666          16 :   EVT EltVT = VecVT.getVectorElementType();
     667             :   SmallVector<SDValue, 8> Args;
     668             : 
     669          64 :   for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
     670          48 :     Args.push_back(DAG.getNode(
     671             :         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
     672         192 :         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
     673             :   }
     674             : 
     675          32 :   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
     676             : }
     677             : 
     678       11046 : SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     679             :                                                     SelectionDAG &DAG) const {
     680             :   SDLoc DL(Op);
     681       11046 :   SDValue Vector = Op.getOperand(0);
     682       11046 :   SDValue Index = Op.getOperand(1);
     683             : 
     684          42 :   if (isa<ConstantSDNode>(Index) ||
     685             :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     686       11032 :     return Op;
     687             : 
     688          14 :   Vector = vectorToVerticalVector(DAG, Vector);
     689             :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
     690          14 :                      Vector, Index);
     691             : }
     692             : 
     693           7 : SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     694             :                                                    SelectionDAG &DAG) const {
     695             :   SDLoc DL(Op);
     696           7 :   SDValue Vector = Op.getOperand(0);
     697           7 :   SDValue Value = Op.getOperand(1);
     698           7 :   SDValue Index = Op.getOperand(2);
     699             : 
     700           3 :   if (isa<ConstantSDNode>(Index) ||
     701             :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     702           6 :     return Op;
     703             : 
     704           1 :   Vector = vectorToVerticalVector(DAG, Vector);
     705             :   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
     706           1 :                                Vector, Value, Index);
     707           1 :   return vectorToVerticalVector(DAG, Insert);
     708             : }
     709             : 
     710          57 : SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
     711             :                                                SDValue Op,
     712             :                                                SelectionDAG &DAG) const {
     713             :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
     714          57 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
     715          42 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
     716             : 
     717          15 :   const DataLayout &DL = DAG.getDataLayout();
     718          15 :   const GlobalValue *GV = GSD->getGlobal();
     719             :   MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
     720             : 
     721          30 :   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
     722          30 :   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
     723             : }
     724             : 
     725          17 : SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
     726             :   // On hw >= R700, COS/SIN input must be between -1. and 1.
     727             :   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
     728          17 :   EVT VT = Op.getValueType();
     729          17 :   SDValue Arg = Op.getOperand(0);
     730             :   SDLoc DL(Op);
     731             : 
     732             :   // TODO: Should this propagate fast-math-flags?
     733             :   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
     734             :       DAG.getNode(ISD::FADD, DL, VT,
     735             :         DAG.getNode(ISD::FMUL, DL, VT, Arg,
     736             :           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
     737          34 :         DAG.getConstantFP(0.5, DL, MVT::f32)));
     738             :   unsigned TrigNode;
     739          17 :   switch (Op.getOpcode()) {
     740             :   case ISD::FCOS:
     741             :     TrigNode = AMDGPUISD::COS_HW;
     742             :     break;
     743          11 :   case ISD::FSIN:
     744             :     TrigNode = AMDGPUISD::SIN_HW;
     745          11 :     break;
     746           0 :   default:
     747           0 :     llvm_unreachable("Wrong trig opcode");
     748             :   }
     749             :   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
     750             :       DAG.getNode(ISD::FADD, DL, VT, FractPart,
     751          17 :         DAG.getConstantFP(-0.5, DL, MVT::f32)));
     752          17 :   if (Gen >= R600Subtarget::R700)
     753          17 :     return TrigVal;
     754             :   // On R600 hw, COS/SIN input must be between -Pi and Pi.
     755             :   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
     756           0 :       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
     757             : }
     758             : 
     759          50 : SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
     760             :   SDLoc DL(Op);
     761          50 :   EVT VT = Op.getValueType();
     762             : 
     763          50 :   SDValue Lo = Op.getOperand(0);
     764          50 :   SDValue Hi = Op.getOperand(1);
     765          50 :   SDValue Shift = Op.getOperand(2);
     766          50 :   SDValue Zero = DAG.getConstant(0, DL, VT);
     767          50 :   SDValue One  = DAG.getConstant(1, DL, VT);
     768             : 
     769          50 :   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
     770          50 :   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
     771          50 :   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
     772          50 :   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
     773             : 
     774             :   // The dance around Width1 is necessary for 0 special case.
     775             :   // Without it the CompShift might be 32, producing incorrect results in
     776             :   // Overflow. So we do the shift in two steps, the alternative is to
     777             :   // add a conditional to filter the special case.
     778             : 
     779          50 :   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
     780          50 :   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
     781             : 
     782          50 :   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
     783          50 :   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
     784          50 :   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
     785             : 
     786          50 :   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
     787          50 :   SDValue LoBig = Zero;
     788             : 
     789          50 :   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
     790          50 :   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
     791             : 
     792         100 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
     793             : }
     794             : 
     795          28 : SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
     796             :   SDLoc DL(Op);
     797          28 :   EVT VT = Op.getValueType();
     798             : 
     799          28 :   SDValue Lo = Op.getOperand(0);
     800          28 :   SDValue Hi = Op.getOperand(1);
     801          28 :   SDValue Shift = Op.getOperand(2);
     802          28 :   SDValue Zero = DAG.getConstant(0, DL, VT);
     803          28 :   SDValue One  = DAG.getConstant(1, DL, VT);
     804             : 
     805             :   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
     806             : 
     807          28 :   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
     808          28 :   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
     809          28 :   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
     810          28 :   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
     811             : 
     812             :   // The dance around Width1 is necessary for 0 special case.
     813             :   // Without it the CompShift might be 32, producing incorrect results in
     814             :   // Overflow. So we do the shift in two steps, the alternative is to
     815             :   // add a conditional to filter the special case.
     816             : 
     817          28 :   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
     818          28 :   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
     819             : 
     820          28 :   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
     821          28 :   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
     822          28 :   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
     823             : 
     824          28 :   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
     825          35 :   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
     826             : 
     827          28 :   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
     828          28 :   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
     829             : 
     830          56 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
     831             : }
     832             : 
     833         684 : SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
     834             :                                           unsigned mainop, unsigned ovf) const {
     835             :   SDLoc DL(Op);
     836         684 :   EVT VT = Op.getValueType();
     837             : 
     838         684 :   SDValue Lo = Op.getOperand(0);
     839         684 :   SDValue Hi = Op.getOperand(1);
     840             : 
     841         684 :   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
     842             :   // Extend sign.
     843         684 :   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
     844        1368 :                     DAG.getValueType(MVT::i1));
     845             : 
     846         684 :   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
     847             : 
     848        1368 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
     849             : }
     850             : 
     851           2 : SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
     852             :   SDLoc DL(Op);
     853             :   return DAG.getNode(
     854             :       ISD::SETCC,
     855             :       DL,
     856             :       MVT::i1,
     857             :       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
     858           8 :       DAG.getCondCode(ISD::SETEQ));
     859             : }
     860             : 
     861           2 : SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
     862             :   SDLoc DL(Op);
     863             :   return DAG.getNode(
     864             :       ISD::SETCC,
     865             :       DL,
     866             :       MVT::i1,
     867             :       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
     868           8 :       DAG.getCondCode(ISD::SETEQ));
     869             : }
     870             : 
     871          89 : SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
     872             :                                                    const SDLoc &DL,
     873             :                                                    unsigned DwordOffset) const {
     874          89 :   unsigned ByteOffset = DwordOffset * 4;
     875          89 :   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
     876          89 :                                       AMDGPUASI.CONSTANT_BUFFER_0);
     877             : 
     878             :   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
     879             :   assert(isInt<16>(ByteOffset));
     880             : 
     881             :   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
     882             :                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
     883         267 :                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
     884             : }
     885             : 
     886       21260 : bool R600TargetLowering::isZero(SDValue Op) const {
     887             :   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     888        9772 :     return Cst->isNullValue();
     889             :   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
     890         440 :     return CstFP->isZero();
     891             :   } else {
     892             :     return false;
     893             :   }
     894             : }
     895             : 
     896       32690 : bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
     897             :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     898         936 :     return CFP->isExactlyValue(1.0);
     899             :   }
     900       32222 :   return isAllOnesConstant(Op);
     901             : }
     902             : 
     903        5894 : bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
     904             :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     905         310 :     return CFP->getValueAPF().isZero();
     906             :   }
     907        5739 :   return isNullConstant(Op);
     908             : }
     909             : 
/// Custom lowering for SELECT_CC. Tries, in order: a legacy fmin/fmax
/// combine for f32, a native SET* instruction (true/false operands are the
/// hardware constants), a native CND* instruction (compare against zero),
/// and finally a two-step expansion using two supported SELECT_CC nodes.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // f32 selects may fold into a legacy min/max node instead.
  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand. If the operands
  // are reversed (false in the True slot), invert the condition -- or invert
  // and swap the compare operands -- so the SET* pattern can still match.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // CND* has no "not equal" form; invert the condition and swap the
    // select arms instead.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
    1051             : 
    1052             : /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
    1053             : /// convert these pointers to a register index.  Each register holds
    1054             : /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
    1055             : /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
    1056             : /// for indirect addressing.
    1057           0 : SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
    1058             :                                                unsigned StackWidth,
    1059             :                                                SelectionDAG &DAG) const {
    1060             :   unsigned SRLPad;
    1061           0 :   switch(StackWidth) {
    1062             :   case 1:
    1063             :     SRLPad = 2;
    1064             :     break;
    1065           0 :   case 2:
    1066             :     SRLPad = 3;
    1067           0 :     break;
    1068           0 :   case 4:
    1069             :     SRLPad = 4;
    1070           0 :     break;
    1071           0 :   default: llvm_unreachable("Invalid stack width");
    1072             :   }
    1073             : 
    1074             :   SDLoc DL(Ptr);
    1075             :   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
    1076           0 :                      DAG.getConstant(SRLPad, DL, MVT::i32));
    1077             : }
    1078             : 
    1079           0 : void R600TargetLowering::getStackAddress(unsigned StackWidth,
    1080             :                                          unsigned ElemIdx,
    1081             :                                          unsigned &Channel,
    1082             :                                          unsigned &PtrIncr) const {
    1083           0 :   switch (StackWidth) {
    1084           0 :   default:
    1085             :   case 1:
    1086           0 :     Channel = 0;
    1087           0 :     if (ElemIdx > 0) {
    1088           0 :       PtrIncr = 1;
    1089             :     } else {
    1090           0 :       PtrIncr = 0;
    1091             :     }
    1092             :     break;
    1093           0 :   case 2:
    1094           0 :     Channel = ElemIdx % 2;
    1095           0 :     if (ElemIdx == 2) {
    1096           0 :       PtrIncr = 1;
    1097             :     } else {
    1098           0 :       PtrIncr = 0;
    1099             :     }
    1100             :     break;
    1101           0 :   case 4:
    1102           0 :     Channel = ElemIdx;
    1103           0 :     PtrIncr = 0;
    1104           0 :     break;
    1105             :   }
    1106           0 : }
    1107             : 
/// Lower a truncating (or sub-dword, e.g. i1/i8) store to private memory.
///
/// Private memory only supports dword accesses, so the store is emulated as a
/// read-modify-write of the 32-bit dword that contains the target byte/short:
/// load the dword, clear the target bits with a mask, OR in the shifted value,
/// and store the whole dword back.
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);
  //TODO: Who creates the i8 stores?
  assert(Store->isTruncatingStore()
         || Store->getValue().getValueType() == MVT::i8);
  assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  // Mask selecting the bits of the dword this store replaces.
  SDValue Mask;
  if (Store->getMemoryVT() == MVT::i8) {
    assert(Store->getAlignment() >= 1);
    Mask = DAG.getConstant(0xff, DL, MVT::i32);
  } else if (Store->getMemoryVT() == MVT::i16) {
    assert(Store->getAlignment() >= 2);
    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
  } else {
    llvm_unreachable("Unsupported private trunc store");
  }

  SDValue OldChain = Store->getChain();
  bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
  // Skip the DUMMY_CHAIN marker that LowerSTORE inserts when it splits a
  // vector truncating store into per-element stores.
  SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
  SDValue BasePtr = Store->getBasePtr();
  SDValue Offset = Store->getOffset();
  EVT MemVT = Store->getMemoryVT();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location (round the byte address down to a dword boundary).
  // TODO: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(UndefValue::get(
      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  // All following nodes chain after the emulating load.
  Chain = Dst.getValue(1);

  // Get byte offset within the dword.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Convert byte offset to bit shift (byte index * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // TODO: Contrary to the name of the function,
  // it also handles sub i32 non-truncating stores (like i1)
  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  // Mask the value to the right type
  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  // Shift the value in place
  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  // Shift the mask in place
  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);

  // Invert the mask. NOTE: if we had native ROL instructions we could
  // use inverted mask
  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);

  // Cleanup the target bits
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  // Add the new bits
  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

  // Store dword
  // TODO: Can we be smarter about MachinePointerInfo?
  SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo);

  // If we are part of expanded vector, make our neighbors depend on this store
  if (VectorTrunc) {
    // Make all other vector elements depend on this store
    Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
    DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
  }
  return NewStore;
}
    1198             : 
/// Custom lowering for ISD::STORE.
///
/// Dispatches on address space and memory type:
///  - vector stores to LOCAL/PRIVATE are scalarized (PRIVATE truncating
///    vector stores first get a DUMMY_CHAIN marker so the per-element RMW
///    stores in lowerPrivateTruncStore stay ordered);
///  - under-aligned stores are expanded;
///  - GLOBAL truncating stores become a STORE_MSKOR read-modify-write, and
///    dword-or-wider GLOBAL stores get a DWORDADDR-tagged pointer;
///  - PRIVATE sub-dword stores go through lowerPrivateTruncStore, dword-or-
///    wider PRIVATE stores get DWORDADDR tagging;
///  - anything else is left for the generic legalizer / patterns.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();

  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();
  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();
  EVT PtrVT = Ptr.getValueType();

  SDLoc DL(Op);

  // Neither LOCAL nor PRIVATE can do vectors at the moment
  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
      VT.isVector()) {
    if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
         StoreNode->isTruncatingStore()) {
      // Add an extra level of chain to isolate this vector
      SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
      // TODO: can the chain be replaced without creating a new store?
      SDValue NewStore = DAG.getTruncStore(
          NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
          MemVT, StoreNode->getAlignment(),
          StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
      StoreNode = cast<StoreSDNode>(NewStore);
    }

    return scalarizeVectorStore(StoreNode, DAG);
  }

  unsigned Align = StoreNode->getAlignment();
  if (Align < MemVT.getStoreSize() &&
      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
    return expandUnalignedStore(StoreNode, DAG);
  }

  // Byte address shifted down to a dword address; used by both the MSKOR and
  // DWORDADDR paths below.
  SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                  DAG.getConstant(2, DL, PtrVT));

  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
    // It is beneficial to create MSKOR here instead of combiner to avoid
    // artificial dependencies introduced by RMW
    if (StoreNode->isTruncatingStore()) {
      assert(VT.bitsLE(MVT::i32));
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        assert(StoreNode->getAlignment() >= 2);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }

      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
                                      DAG.getConstant(0x00000003, DL, PtrVT));
      SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                     DAG.getConstant(3, DL, VT));

      // Put the mask in correct place
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);

      // Put the value bits in correct place
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);

      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
  if (AS != AMDGPUASI.PRIVATE_ADDRESS)
    return SDValue();

  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Standard i32+ store, tag it with DWORDADDR to note that the address
  // has been shifted
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
  }

  // Tagged i32+ stores will be matched by patterns
  return SDValue();
}
    1309             : 
    1310             : // return (512 + (kc_bank << 12)
    1311             : static int
    1312             : ConstantAddressBlock(unsigned AddressSpace) {
    1313             :   switch (AddressSpace) {
    1314             :   case AMDGPUAS::CONSTANT_BUFFER_0:
    1315             :     return 512;
    1316             :   case AMDGPUAS::CONSTANT_BUFFER_1:
    1317             :     return 512 + 4096;
    1318             :   case AMDGPUAS::CONSTANT_BUFFER_2:
    1319             :     return 512 + 4096 * 2;
    1320             :   case AMDGPUAS::CONSTANT_BUFFER_3:
    1321             :     return 512 + 4096 * 3;
    1322             :   case AMDGPUAS::CONSTANT_BUFFER_4:
    1323             :     return 512 + 4096 * 4;
    1324             :   case AMDGPUAS::CONSTANT_BUFFER_5:
    1325             :     return 512 + 4096 * 5;
    1326             :   case AMDGPUAS::CONSTANT_BUFFER_6:
    1327             :     return 512 + 4096 * 6;
    1328             :   case AMDGPUAS::CONSTANT_BUFFER_7:
    1329             :     return 512 + 4096 * 7;
    1330             :   case AMDGPUAS::CONSTANT_BUFFER_8:
    1331             :     return 512 + 4096 * 8;
    1332             :   case AMDGPUAS::CONSTANT_BUFFER_9:
    1333             :     return 512 + 4096 * 9;
    1334             :   case AMDGPUAS::CONSTANT_BUFFER_10:
    1335             :     return 512 + 4096 * 10;
    1336             :   case AMDGPUAS::CONSTANT_BUFFER_11:
    1337             :     return 512 + 4096 * 11;
    1338             :   case AMDGPUAS::CONSTANT_BUFFER_12:
    1339             :     return 512 + 4096 * 12;
    1340             :   case AMDGPUAS::CONSTANT_BUFFER_13:
    1341             :     return 512 + 4096 * 13;
    1342             :   case AMDGPUAS::CONSTANT_BUFFER_14:
    1343             :     return 512 + 4096 * 14;
    1344             :   case AMDGPUAS::CONSTANT_BUFFER_15:
    1345             :     return 512 + 4096 * 15;
    1346             :   default:
    1347             :     return -1;
    1348             :   }
    1349             : }
    1350             : 
/// Lower an extending sub-dword load from private memory.
///
/// Private memory only supports dword accesses, so the byte/short is emulated
/// by loading the containing 32-bit dword, shifting the target byte(s) down to
/// bit 0, and then sign- or zero-extending according to the load's extension
/// type.  Returns the merged {value, chain} pair.
SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  assert(Load->getAlignment() >= MemVT.getStoreSize());

  SDValue BasePtr = Load->getBasePtr();
  SDValue Chain = Load->getChain();
  SDValue Offset = Load->getOffset();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location (round the byte address down to a dword boundary).
  // NOTE: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(UndefValue::get(
      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  if (ExtType == ISD::SEXTLOAD) { // ... ones.
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  } else { // ... or zeros.
    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
  }

  SDValue Ops[] = {
    Ret,
    Read.getValue(1) // This should be our output chain
  };

  return DAG.getMergeValues(Ops, DL);
}
    1407             : 
    1408       39125 : SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    1409             :   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
    1410             :   unsigned AS = LoadNode->getAddressSpace();
    1411       39125 :   EVT MemVT = LoadNode->getMemoryVT();
    1412             :   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
    1413             : 
    1414       64747 :   if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
    1415       78250 :       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    1416        4054 :     return lowerPrivateExtLoad(Op, DAG);
    1417             :   }
    1418             : 
    1419             :   SDLoc DL(Op);
    1420       35071 :   EVT VT = Op.getValueType();
    1421       35071 :   SDValue Chain = LoadNode->getChain();
    1422       35071 :   SDValue Ptr = LoadNode->getBasePtr();
    1423             : 
    1424       30592 :   if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
    1425       91710 :       LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
    1426             :       VT.isVector()) {
    1427         374 :       return scalarizeVectorLoad(LoadNode, DAG);
    1428             :   }
    1429             : 
    1430             :   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
    1431       12182 :   if (ConstantBlock > -1 &&
    1432         962 :       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
    1433             :        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    1434             :     SDValue Result;
    1435       10020 :     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
    1436        5136 :         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
    1437             :         isa<ConstantSDNode>(Ptr)) {
    1438        5136 :       SDValue Slots[4];
    1439       46224 :       for (unsigned i = 0; i < 4; i++) {
    1440             :         // We want Const position encoded with the following formula :
    1441             :         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    1442             :         // const_index is Ptr computed by llvm using an alignment of 16.
    1443             :         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
    1444             :         // then div by 4 at the ISel step
    1445             :         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
    1446       41088 :             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
    1447       20544 :         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
    1448             :       }
    1449        5136 :       EVT NewVT = MVT::v4i32;
    1450             :       unsigned NumElements = 4;
    1451        5136 :       if (VT.isVector()) {
    1452         541 :         NewVT = VT;
    1453         541 :         NumElements = VT.getVectorNumElements();
    1454             :       }
    1455        5136 :       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
    1456             :     } else {
    1457             :       // non-constant ptr can't be folded, keeps it as a v4f32 load
    1458           0 :       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
    1459             :           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
    1460             :                       DAG.getConstant(4, DL, MVT::i32)),
    1461           0 :                       DAG.getConstant(LoadNode->getAddressSpace() -
    1462             :                                       AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
    1463           0 :           );
    1464             :     }
    1465             : 
    1466        5136 :     if (!VT.isVector()) {
    1467        4595 :       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
    1468        9190 :                            DAG.getConstant(0, DL, MVT::i32));
    1469             :     }
    1470             : 
    1471             :     SDValue MergedValues[2] = {
    1472             :       Result,
    1473             :       Chain
    1474        5136 :     };
    1475        5136 :     return DAG.getMergeValues(MergedValues, DL);
    1476             :   }
    1477             : 
    1478             :   // For most operations returning SDValue() will result in the node being
    1479             :   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
    1480             :   // need to manually expand loads that may be legal in some address spaces and
    1481             :   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
    1482             :   // compute shaders, since the data is sign extended when it is uploaded to the
    1483             :   // buffer. However SEXT loads from other address spaces are not supported, so
    1484             :   // we need to expand them here.
    1485       29561 :   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    1486         290 :     EVT MemVT = LoadNode->getMemoryVT();
    1487             :     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    1488             :     SDValue NewLoad = DAG.getExtLoad(
    1489         290 :         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
    1490         580 :         LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    1491             :     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
    1492         290 :                               DAG.getValueType(MemVT));
    1493             : 
    1494         290 :     SDValue MergedValues[2] = { Res, Chain };
    1495         290 :     return DAG.getMergeValues(MergedValues, DL);
    1496             :   }
    1497             : 
    1498       29271 :   if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
    1499        7888 :     return SDValue();
    1500             :   }
    1501             : 
    1502             :   // DWORDADDR ISD marks already shifted address
    1503       21383 :   if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    1504             :     assert(VT == MVT::i32);
    1505       11794 :     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
    1506        5897 :     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
    1507       11794 :     return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
    1508             :   }
    1509       15486 :   return SDValue();
    1510             : }
    1511             : 
    1512          86 : SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    1513          86 :   SDValue Chain = Op.getOperand(0);
    1514          86 :   SDValue Cond  = Op.getOperand(1);
    1515          86 :   SDValue Jump  = Op.getOperand(2);
    1516             : 
    1517          86 :   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
    1518         172 :                      Chain, Jump, Cond);
    1519             : }
    1520             : 
    1521        1606 : SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
    1522             :                                             SelectionDAG &DAG) const {
    1523        1606 :   MachineFunction &MF = DAG.getMachineFunction();
    1524        1606 :   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
    1525             : 
    1526             :   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
    1527             : 
    1528        1606 :   unsigned FrameIndex = FIN->getIndex();
    1529             :   unsigned IgnoredFrameReg;
    1530             :   unsigned Offset =
    1531        1606 :     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
    1532        3212 :   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
    1533        3212 :                          Op.getValueType());
    1534             : }
    1535             : 
    1536             : /// XXX Only kernel functions are supported, so we can assume for now that
    1537             : /// every function is a kernel function, but in the future we should use
    1538             : /// separate calling conventions for kernel and non-kernel functions.
    1539        2232 : SDValue R600TargetLowering::LowerFormalArguments(
    1540             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1541             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1542             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1543             :   SmallVector<CCValAssign, 16> ArgLocs;
    1544             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1545        4464 :                  *DAG.getContext());
    1546        2232 :   MachineFunction &MF = DAG.getMachineFunction();
    1547        2232 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    1548             : 
    1549             :   SmallVector<ISD::InputArg, 8> LocalIns;
    1550             : 
    1551        2232 :   if (AMDGPU::isShader(CallConv)) {
    1552          49 :     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
    1553             :   } else {
    1554        2183 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1555             :   }
    1556             : 
    1557        7882 :   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    1558        5650 :     CCValAssign &VA = ArgLocs[i];
    1559             :     const ISD::InputArg &In = Ins[i];
    1560             :     EVT VT = In.VT;
    1561             :     EVT MemVT = VA.getLocVT();
    1562       10936 :     if (!VT.isVector() && MemVT.isVector()) {
    1563             :       // Get load source type if scalarized.
    1564           0 :       MemVT = MemVT.getVectorElementType();
    1565             :     }
    1566             : 
    1567        5650 :     if (AMDGPU::isShader(CallConv)) {
    1568          65 :       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
    1569          65 :       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1570          65 :       InVals.push_back(Register);
    1571          65 :       continue;
    1572             :     }
    1573             : 
    1574        5585 :     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
    1575        5585 :                                           AMDGPUASI.CONSTANT_BUFFER_0);
    1576             : 
    1577             :     // i64 isn't a legal type, so the register type used ends up as i32, which
    1578             :     // isn't expected here. It attempts to create this sextload, but it ends up
    1579             :     // being invalid. Somehow this seems to work with i64 arguments, but breaks
    1580             :     // for <1 x i64>.
    1581             : 
    1582             :     // The first 36 bytes of the input buffer contains information about
    1583             :     // thread group and global sizes.
    1584             :     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    1585        5585 :     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
    1586             :       // FIXME: This should really check the extload type, but the handling of
    1587             :       // extload vector parameters seems to be broken.
    1588             : 
    1589             :       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
    1590             :       Ext = ISD::SEXTLOAD;
    1591             :     }
    1592             : 
    1593             :     // Compute the offset from the value.
    1594             :     // XXX - I think PartOffset should give you this, but it seems to give the
    1595             :     // size of the register which isn't useful.
    1596             : 
    1597       11170 :     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    1598        5585 :     unsigned PartOffset = VA.getLocMemOffset();
    1599        5585 :     unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF.getFunction()) +
    1600        5585 :                       VA.getLocMemOffset();
    1601             : 
    1602        5585 :     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    1603             :     SDValue Arg = DAG.getLoad(
    1604             :         ISD::UNINDEXED, Ext, VT, DL, Chain,
    1605             :         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
    1606             :         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
    1607             :                                         MachineMemOperand::MODereferenceable |
    1608       11170 :                                         MachineMemOperand::MOInvariant);
    1609             : 
    1610             :     // 4 is the preferred alignment for the CONSTANT memory space.
    1611        5585 :     InVals.push_back(Arg);
    1612        5585 :     MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
    1613             :   }
    1614        4464 :   return Chain;
    1615             : }
    1616             : 
    1617       34818 : EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    1618             :                                            EVT VT) const {
    1619       34818 :    if (!VT.isVector())
    1620       34778 :      return MVT::i32;
    1621          40 :    return VT.changeVectorElementTypeToInteger();
    1622             : }
    1623             : 
    1624         101 : bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
    1625             :                                           const SelectionDAG &DAG) const {
    1626             :   // Local and Private addresses do not handle vectors. Limit to i32
    1627         101 :   if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
    1628         100 :     return (MemVT.getSizeInBits() <= 32);
    1629             :   }
    1630             :   return true;
    1631             : }
    1632             : 
    1633         844 : bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    1634             :                                                         unsigned AddrSpace,
    1635             :                                                         unsigned Align,
    1636             :                                                         bool *IsFast) const {
    1637         844 :   if (IsFast)
    1638         609 :     *IsFast = false;
    1639             : 
    1640         844 :   if (!VT.isSimple() || VT == MVT::Other)
    1641             :     return false;
    1642             : 
    1643         843 :   if (VT.bitsLT(MVT::i32))
    1644             :     return false;
    1645             : 
    1646             :   // TODO: This is a rough estimate.
    1647         808 :   if (IsFast)
    1648         597 :     *IsFast = true;
    1649             : 
    1650         808 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
    1651             : }
    1652             : 
/// Canonicalize a 4-lane BUILD_VECTOR that feeds a swizzle: lanes that the
/// swizzle hardware can synthesize directly (undef, 0.0, 1.0, or a duplicate
/// of an earlier lane) are replaced with UNDEF, and RemapSwizzle records the
/// select value to use for each such lane (7 = SEL_MASK_WRITE, 4 = SEL_0,
/// 5 = SEL_1, or the index of the duplicated earlier lane). Lanes not present
/// in RemapSwizzle are unchanged.
static SDValue CompactSwizzlableVector(
  SelectionDAG &DAG, SDValue VectorEntry,
  DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].isUndef())
      // We mask write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
      // break false dependencies and additionnaly make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      // Exact 0.0 and 1.0 have dedicated swizzle selects, so the lane itself
      // can be dropped (replaced with UNDEF).
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    // Lanes already turned into UNDEF (originally undef or folded to a
    // special select above) take no part in duplicate detection.
    if (NewBldVec[i].isUndef())
      continue;
    // A lane identical to an earlier lane is redirected to that lane's index
    // and its own slot freed up as UNDEF.
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}
    1695             : 
    1696         392 : static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
    1697             :                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
    1698             :   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
    1699             :   assert(RemapSwizzle.empty());
    1700             :   SDValue NewBldVec[4] = {
    1701             :       VectorEntry.getOperand(0),
    1702             :       VectorEntry.getOperand(1),
    1703             :       VectorEntry.getOperand(2),
    1704             :       VectorEntry.getOperand(3)
    1705         392 :   };
    1706         392 :   bool isUnmovable[4] = { false, false, false, false };
    1707        1960 :   for (unsigned i = 0; i < 4; i++) {
    1708        1568 :     RemapSwizzle[i] = i;
    1709        3136 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1710             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1711          94 :           ->getZExtValue();
    1712          47 :       if (i == Idx)
    1713          39 :         isUnmovable[Idx] = true;
    1714             :     }
    1715             :   }
    1716             : 
    1717        1949 :   for (unsigned i = 0; i < 4; i++) {
    1718        3126 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1719             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1720          92 :           ->getZExtValue();
    1721          46 :       if (isUnmovable[Idx])
    1722          40 :         continue;
    1723             :       // Swap i and Idx
    1724           6 :       std::swap(NewBldVec[Idx], NewBldVec[i]);
    1725             :       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
    1726           6 :       break;
    1727             :     }
    1728             :   }
    1729             : 
    1730         392 :   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
    1731         784 :                             NewBldVec);
    1732             : }
    1733             : 
    1734         392 : SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
    1735             :                                             SelectionDAG &DAG,
    1736             :                                             const SDLoc &DL) const {
    1737             :   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
    1738             :   // Old -> New swizzle values
    1739             :   DenseMap<unsigned, unsigned> SwizzleRemap;
    1740             : 
    1741         392 :   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
    1742        3528 :   for (unsigned i = 0; i < 4; i++) {
    1743        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1744        1568 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1745         119 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1746             :   }
    1747             : 
    1748         392 :   SwizzleRemap.clear();
    1749         392 :   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
    1750        3528 :   for (unsigned i = 0; i < 4; i++) {
    1751        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1752        1568 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1753        1350 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1754             :   }
    1755             : 
    1756         784 :   return BuildVector;
    1757             : }
    1758             : 
    1759             : //===----------------------------------------------------------------------===//
    1760             : // Custom DAG Optimizations
    1761             : //===----------------------------------------------------------------------===//
    1762             : 
/// R600-specific DAG combines, applied before falling back to the generic
/// AMDGPU combines. Each case documents the pattern it rewrites.
SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
      SDValue Arg = N->getOperand(0);
      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
        return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
                           Arg.getOperand(0));
      }
      break;
    }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, DL, MVT::i32), // True
                           DAG.getConstant(0, DL, MVT::i32),  // False
                           SelectCC.getOperand(4)); // CC

    break; // NOTE(review): unreachable after the return above.
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        // Extend or truncate the inserted value to the common element type.
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be customly combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      // Constant-index extract of a build_vector folds to the operand.
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    // Same fold through a bitcast, but only when the bitcast does not change
    // the element count (so element indices stay meaningful).
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
        (Arg.getOperand(0).getValueType().getVectorNumElements() ==
         Arg.getValueType().getVectorNumElements())) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    // The outer select must compare the inner select's result against its
    // own false value and choose between the same true/false pair.
    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                  LHS.getOperand(0).getValueType().isInteger());
      // Only emit the inverted compare if it is (or will be made) legal.
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(DL,
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::R600_EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    // OptimizeSwizzle rewrites the SWZ_* selects in place and returns the
    // compacted vector for slot 1.
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    // Operands 2..5 are the swizzle selects for the coordinate vector.
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }
  default: break;
  }

  // Fall back to the target-independent AMDGPU combines.
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
    1982             : 
    1983      240499 : bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
    1984             :                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
    1985             :                                      SDValue &Sel, SDValue &Imm,
    1986             :                                      SelectionDAG &DAG) const {
    1987      240499 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    1988      480998 :   if (!Src.isMachineOpcode())
    1989             :     return false;
    1990             : 
    1991      154472 :   switch (Src.getMachineOpcode()) {
    1992         125 :   case AMDGPU::FNEG_R600:
    1993         125 :     if (!Neg.getNode())
    1994             :       return false;
    1995         101 :     Src = Src.getOperand(0);
    1996         202 :     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    1997         101 :     return true;
    1998         109 :   case AMDGPU::FABS_R600:
    1999         109 :     if (!Abs.getNode())
    2000             :       return false;
    2001          93 :     Src = Src.getOperand(0);
    2002         186 :     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    2003          93 :     return true;
    2004       10608 :   case AMDGPU::CONST_COPY: {
    2005       10608 :     unsigned Opcode = ParentNode->getMachineOpcode();
    2006       10608 :     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2007             : 
    2008       10608 :     if (!Sel.getNode())
    2009             :       return false;
    2010             : 
    2011       18526 :     SDValue CstOffset = Src.getOperand(0);
    2012       27789 :     if (ParentNode->getValueType(0).isVector())
    2013             :       return false;
    2014             : 
    2015             :     // Gather constants values
    2016             :     int SrcIndices[] = {
    2017        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
    2018        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
    2019        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
    2020        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
    2021        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
    2022        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
    2023        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
    2024        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
    2025        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
    2026        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
    2027        9263 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    2028      101893 :     };
    2029             :     std::vector<unsigned> Consts;
    2030      213049 :     for (int OtherSrcIdx : SrcIndices) {
    2031      101893 :       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
    2032      101893 :       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
    2033       83135 :         continue;
    2034       18758 :       if (HasDst) {
    2035       18758 :         OtherSrcIdx--;
    2036       18758 :         OtherSelIdx--;
    2037             :       }
    2038             :       if (RegisterSDNode *Reg =
    2039       18758 :           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
    2040         725 :         if (Reg->getReg() == AMDGPU::ALU_CONST) {
    2041             :           ConstantSDNode *Cst
    2042         557 :             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
    2043        1671 :           Consts.push_back(Cst->getZExtValue());
    2044             :         }
    2045             :       }
    2046             :     }
    2047             : 
    2048             :     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    2049       27789 :     Consts.push_back(Cst->getZExtValue());
    2050        9263 :     if (!TII->fitsConstReadLimitations(Consts)) {
    2051             :       return false;
    2052             :     }
    2053             : 
    2054        9243 :     Sel = CstOffset;
    2055        9243 :     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    2056        9243 :     return true;
    2057             :   }
    2058             :   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
    2059             :     // Check if the Imm slot is used. Taken from below.
    2060          28 :     if (cast<ConstantSDNode>(Imm)->getZExtValue())
    2061             :       return false;
    2062          14 :     Imm = Src.getOperand(0);
    2063          14 :     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
    2064          14 :     return true;
    2065       27383 :   case AMDGPU::MOV_IMM_I32:
    2066             :   case AMDGPU::MOV_IMM_F32: {
    2067             :     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    2068             :     uint64_t ImmValue = 0;
    2069             : 
    2070       27383 :     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
    2071             :       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
    2072        1056 :       float FloatValue = FPC->getValueAPF().convertToFloat();
    2073         528 :       if (FloatValue == 0.0) {
    2074             :         ImmReg = AMDGPU::ZERO;
    2075         386 :       } else if (FloatValue == 0.5) {
    2076             :         ImmReg = AMDGPU::HALF;
    2077         349 :       } else if (FloatValue == 1.0) {
    2078             :         ImmReg = AMDGPU::ONE;
    2079             :       } else {
    2080         792 :         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
    2081             :       }
    2082             :     } else {
    2083             :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
    2084       26855 :       uint64_t Value = C->getZExtValue();
    2085       26855 :       if (Value == 0) {
    2086             :         ImmReg = AMDGPU::ZERO;
    2087       25165 :       } else if (Value == 1) {
    2088             :         ImmReg = AMDGPU::ONE_INT;
    2089             :       } else {
    2090             :         ImmValue = Value;
    2091             :       }
    2092             :     }
    2093             : 
    2094             :     // Check that we aren't already using an immediate.
    2095             :     // XXX: It's possible for an instruction to have more than one
    2096             :     // immediate operand, but this is not supported yet.
    2097             :     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
    2098       22561 :       if (!Imm.getNode())
    2099             :         return false;
    2100             :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
    2101             :       assert(C);
    2102       45014 :       if (C->getZExtValue())
    2103             :         return false;
    2104       40526 :       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    2105             :     }
    2106       25085 :     Src = DAG.getRegister(ImmReg, MVT::i32);
    2107       25085 :     return true;
    2108             :   }
    2109             :   default:
    2110             :     return false;
    2111             :   }
    2112             : }
    2113             : 
/// Fold the instructions after selecting them
///
/// After instruction selection, walk the selected machine node's operands and
/// try to fold each source operand (via FoldOperand) into the node's operand
/// modifiers — e.g. neg/abs flags, register-select, or an inline literal slot.
/// Returns a freshly built machine node with the updated operand list if any
/// fold succeeded, otherwise returns \p Node unchanged.
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
  // Only machine nodes have the named-operand tables queried below.
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  // FakeOp is a dummy SDValue used wherever a modifier slot (neg/abs/sel/imm)
  // does not exist for the instruction; FoldOperand writes into it harmlessly.
  SDValue FakeOp;

  // Mutable copy of the node's operands; successful folds edit this in place
  // and the function rebuilds the node from it.
  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == AMDGPU::DOT_4) {
    // DOT_4 has eight sources (src0/src1 across the X/Y/Z/W channels), each
    // with its own neg and abs modifier operand.
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
        };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      // A negative index means the named operand is absent; nothing to fold.
      if (OperandIdx[i] < 0)
        return Node;
      // NOTE(review): the MI operand indices appear to be off by one relative
      // to the SDNode operand list (presumably the dst def occupies MI slot 0
      // but is not an SDNode operand) — hence the "- 1" here and the SelIdx
      // decrement below when a dst operand exists. TODO confirm.
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      // DOT_4 has no literal slot here, so FakeOp stands in for Imm.
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    // REG_SEQUENCE operands alternate (register, subregister-index) pairs
    // starting at operand 1, so visit every second operand.
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      // No modifier operands on REG_SEQUENCE; only the source itself folds.
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else {
    // Generic ALU instructions: up to three sources with neg modifiers, abs
    // only on src0/src1, plus an optional inline literal operand.
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1   // src2 has no abs modifier
    };
    for (unsigned i = 0; i < 3; i++) {
      // Stop at the first source the instruction doesn't have.
      if (OperandIdx[i] < 0)
        return Node;
      // Same MI-index vs SDNode-operand offset as in the DOT_4 case above.
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      // Compensate for the dst def not being an SDNode operand.
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  // Nothing folded; keep the originally selected node.
  return Node;
}

Generated by: LCOV version 1.13