LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - R600ISelLowering.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 1147 1246 92.1 %
Date: 2017-09-14 15:23:50 Functions: 37 39 94.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Custom DAG lowering for R600
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "R600ISelLowering.h"
      16             : #include "AMDGPUFrameLowering.h"
      17             : #include "AMDGPUIntrinsicInfo.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "R600Defines.h"
      20             : #include "R600FrameLowering.h"
      21             : #include "R600InstrInfo.h"
      22             : #include "R600MachineFunctionInfo.h"
      23             : #include "Utils/AMDGPUBaseInfo.h"
      24             : #include "llvm/ADT/APFloat.h"
      25             : #include "llvm/ADT/APInt.h"
      26             : #include "llvm/ADT/ArrayRef.h"
      27             : #include "llvm/ADT/DenseMap.h"
      28             : #include "llvm/ADT/SmallVector.h"
      29             : #include "llvm/CodeGen/CallingConvLower.h"
      30             : #include "llvm/CodeGen/DAGCombine.h"
      31             : #include "llvm/CodeGen/ISDOpcodes.h"
      32             : #include "llvm/CodeGen/MachineBasicBlock.h"
      33             : #include "llvm/CodeGen/MachineFunction.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineMemOperand.h"
      37             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      38             : #include "llvm/CodeGen/MachineValueType.h"
      39             : #include "llvm/CodeGen/SelectionDAG.h"
      40             : #include "llvm/IR/Constants.h"
      41             : #include "llvm/IR/DerivedTypes.h"
      42             : #include "llvm/Support/Casting.h"
      43             : #include "llvm/Support/Compiler.h"
      44             : #include "llvm/Support/ErrorHandling.h"
      45             : #include <cassert>
      46             : #include <cstdint>
      47             : #include <iterator>
      48             : #include <utility>
      49             : #include <vector>
      50             : 
      51             : using namespace llvm;
      52             : 
      53         253 : R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
      54         253 :                                        const R600Subtarget &STI)
      55         253 :     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
      56         506 :   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
      57         506 :   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
      58         506 :   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
      59         506 :   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
      60         506 :   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
      61         506 :   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
      62             : 
      63         506 :   computeRegisterProperties(STI.getRegisterInfo());
      64             : 
      65             :   // Legalize loads and stores to the private address space.
      66         506 :   setOperationAction(ISD::LOAD, MVT::i32, Custom);
      67         506 :   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
      68         506 :   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
      69             : 
      70             :   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
      71             :   // spaces, so it is custom lowered to handle those where it isn't.
      72        1771 :   for (MVT VT : MVT::integer_valuetypes()) {
      73        3036 :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      74        3036 :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
      75        3036 :     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
      76             : 
      77        3036 :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      78        3036 :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
      79        3036 :     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
      80             : 
      81        3036 :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
      82        3036 :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
      83        3036 :     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
      84             :   }
      85             : 
      86             :   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
      87         506 :   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      88         506 :   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      89         506 :   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
      90             : 
      91         506 :   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      92         506 :   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      93         506 :   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
      94             : 
      95         506 :   setOperationAction(ISD::STORE, MVT::i8, Custom);
      96         506 :   setOperationAction(ISD::STORE, MVT::i32, Custom);
      97         506 :   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
      98         506 :   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
      99             : 
     100         506 :   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
     101         506 :   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
     102             :   // We need to include these since trunc STORES to PRIVATE need
     103             :   // special handling to accommodate RMW
     104         506 :   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
     105         506 :   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
     106         506 :   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
     107         506 :   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
     108         506 :   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
     109         506 :   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
     110         506 :   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
     111         506 :   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
     112         506 :   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
     113         506 :   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
     114             : 
     115             :   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
     116         506 :   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
     117         506 :   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
     118             : 
     119             :   // Set condition code actions
     120         506 :   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
     121         506 :   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
     122         506 :   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
     123         506 :   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
     124         506 :   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
     125         506 :   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
     126         506 :   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
     127         506 :   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
     128         506 :   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
     129         506 :   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
     130         506 :   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
     131         506 :   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
     132             : 
     133         506 :   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
     134         506 :   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
     135         506 :   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
     136         506 :   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
     137             : 
     138         506 :   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     139         506 :   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     140             : 
     141         506 :   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
     142         506 :   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
     143             : 
     144         506 :   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     145         506 :   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     146         506 :   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     147             : 
     148         506 :   setOperationAction(ISD::FSUB, MVT::f32, Expand);
     149             : 
     150         506 :   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
     151         506 :   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
     152             : 
     153         506 :   setOperationAction(ISD::SETCC, MVT::i32, Expand);
     154         506 :   setOperationAction(ISD::SETCC, MVT::f32, Expand);
     155         506 :   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
     156         506 :   setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
     157         506 :   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     158         506 :   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     159             : 
     160         506 :   setOperationAction(ISD::SELECT, MVT::i32, Expand);
     161         506 :   setOperationAction(ISD::SELECT, MVT::f32, Expand);
     162         506 :   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
     163         506 :   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
     164             : 
     165             :   // ADD, SUB overflow.
     166             :   // TODO: turn these into Legal?
     167         506 :   if (Subtarget->hasCARRY())
     168         468 :     setOperationAction(ISD::UADDO, MVT::i32, Custom);
     169             : 
     170         506 :   if (Subtarget->hasBORROW())
     171         468 :     setOperationAction(ISD::USUBO, MVT::i32, Custom);
     172             : 
     173             :   // Expand sign extension of vectors
     174         506 :   if (!Subtarget->hasBFE())
     175          38 :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
     176             : 
     177         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
     178         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
     179             : 
     180         506 :   if (!Subtarget->hasBFE())
     181          38 :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
     182         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
     183         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
     184             : 
     185         506 :   if (!Subtarget->hasBFE())
     186          38 :     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
     187         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
     188         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
     189             : 
     190         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
     191         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
     192         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
     193             : 
     194         506 :   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
     195             : 
     196         506 :   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
     197             : 
     198         506 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
     199         506 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
     200         506 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     201         506 :   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     202             : 
     203         506 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
     204         506 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
     205         506 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     206         506 :   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     207             : 
     208             :   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
     209             :   //  to be Legal/Custom in order to avoid library calls.
     210         506 :   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
     211         506 :   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
     212         506 :   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
     213             : 
     214         506 :   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
     215             : 
     216         253 :   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
     217         759 :   for (MVT VT : ScalarIntVTs) {
     218        1012 :     setOperationAction(ISD::ADDC, VT, Expand);
     219        1012 :     setOperationAction(ISD::SUBC, VT, Expand);
     220        1012 :     setOperationAction(ISD::ADDE, VT, Expand);
     221        1012 :     setOperationAction(ISD::SUBE, VT, Expand);
     222             :   }
     223             : 
     224             :   // LLVM will expand these to atomic_cmp_swap(0)
     225             :   // and atomic_swap, respectively.
     226         506 :   setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
     227         506 :   setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
     228             : 
     229             :   // We need to custom lower some of the intrinsics
     230         506 :   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     231         506 :   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     232             : 
     233         506 :   setSchedulingPreference(Sched::Source);
     234             : 
     235         506 :   setTargetDAGCombine(ISD::FP_ROUND);
     236         506 :   setTargetDAGCombine(ISD::FP_TO_SINT);
     237         506 :   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     238         506 :   setTargetDAGCombine(ISD::SELECT_CC);
     239         506 :   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
     240         506 :   setTargetDAGCombine(ISD::LOAD);
     241         253 : }
     242             : 
     243      403099 : const R600Subtarget *R600TargetLowering::getSubtarget() const {
     244      403099 :   return static_cast<const R600Subtarget *>(Subtarget);
     245             : }
     246             : 
     247        2351 : static inline bool isEOP(MachineBasicBlock::iterator I) {
     248        9404 :   if (std::next(I) == I->getParent()->end())
     249             :     return false;
     250        7029 :   return std::next(I)->getOpcode() == AMDGPU::RETURN;
     251             : }
     252             : 
     253             : MachineBasicBlock *
     254        8645 : R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     255             :                                                 MachineBasicBlock *BB) const {
     256        8645 :   MachineFunction *MF = BB->getParent();
     257        8645 :   MachineRegisterInfo &MRI = MF->getRegInfo();
     258        8645 :   MachineBasicBlock::iterator I = MI;
     259       17290 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
     260             : 
     261       17290 :   switch (MI.getOpcode()) {
     262         841 :   default:
     263             :     // Replace LDS_*_RET instruction that don't have any uses with the
     264             :     // equivalent LDS_*_NORET instruction.
     265         841 :     if (TII->isLDSRetInstr(MI.getOpcode())) {
     266        1682 :       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
     267             :       assert(DstIdx != -1);
     268         841 :       MachineInstrBuilder NewMI;
     269             :       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
     270             :       //        LDS_1A2D support and remove this special case.
     271        2553 :       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
     272          30 :           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
     273             :         return BB;
     274             : 
     275          60 :       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
     276          60 :                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
     277         300 :       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
     278         810 :         NewMI.add(MI.getOperand(i));
     279             :       }
     280             :     } else {
     281           0 :       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
     282             :     }
     283             :     break;
     284           0 :   case AMDGPU::CLAMP_R600: {
     285           0 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     286           0 :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     287           0 :         MI.getOperand(1).getReg());
     288           0 :     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
     289           0 :     break;
     290             :   }
     291             : 
     292          20 :   case AMDGPU::FABS_R600: {
     293          40 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     294          20 :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     295          40 :         MI.getOperand(1).getReg());
     296          20 :     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
     297          20 :     break;
     298             :   }
     299             : 
     300          20 :   case AMDGPU::FNEG_R600: {
     301          40 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     302          20 :         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
     303          40 :         MI.getOperand(1).getReg());
     304          20 :     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
     305          20 :     break;
     306             :   }
     307             : 
     308           0 :   case AMDGPU::MASK_WRITE: {
     309           0 :     unsigned maskedRegister = MI.getOperand(0).getReg();
     310             :     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
     311           0 :     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
     312           0 :     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
     313           0 :     break;
     314             :   }
     315             : 
     316          17 :   case AMDGPU::MOV_IMM_F32:
     317          34 :     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
     318             :                                                             .getFPImm()
     319          34 :                                                             ->getValueAPF()
     320          34 :                                                             .bitcastToAPInt()
     321             :                                                             .getZExtValue());
     322          17 :     break;
     323             : 
     324         452 :   case AMDGPU::MOV_IMM_I32:
     325         452 :     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
     326         452 :                      MI.getOperand(1).getImm());
     327         452 :     break;
     328             : 
     329           1 :   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
     330             :     //TODO: Perhaps combine this instruction with the next if possible
     331             :     auto MIB = TII->buildDefaultInstruction(
     332           2 :         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
     333           1 :     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
     334             :     //TODO: Ugh this is rather ugly
     335           2 :     MIB->getOperand(Idx) = MI.getOperand(1);
     336             :     break;
     337             :   }
     338             : 
     339        2673 :   case AMDGPU::CONST_COPY: {
     340        5346 :     MachineInstr *NewMI = TII->buildDefaultInstruction(
     341       10692 :         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
     342        2673 :     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
     343        2673 :                        MI.getOperand(1).getImm());
     344        2673 :     break;
     345             :   }
     346             : 
     347        2289 :   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
     348             :   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
     349             :   case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
     350       11445 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     351        4578 :         .add(MI.getOperand(0))
     352        6867 :         .add(MI.getOperand(1))
     353        4578 :         .addImm(isEOP(I)); // Set End of program bit
     354        2289 :     break;
     355             : 
     356           2 :   case AMDGPU::RAT_STORE_TYPED_eg:
     357          10 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     358           4 :         .add(MI.getOperand(0))
     359           6 :         .add(MI.getOperand(1))
     360           6 :         .add(MI.getOperand(2))
     361           4 :         .addImm(isEOP(I)); // Set End of program bit
     362           2 :     break;
     363             : 
     364         131 :   case AMDGPU::BRANCH:
     365         655 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
     366         262 :         .add(MI.getOperand(0));
     367         131 :     break;
     368             : 
     369           0 :   case AMDGPU::BRANCH_COND_f32: {
     370             :     MachineInstr *NewMI =
     371           0 :         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
     372           0 :                 AMDGPU::PREDICATE_BIT)
     373           0 :             .add(MI.getOperand(1))
     374           0 :             .addImm(AMDGPU::PRED_SETNE)
     375           0 :             .addImm(0); // Flags
     376           0 :     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
     377           0 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
     378           0 :         .add(MI.getOperand(0))
     379           0 :         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     380           0 :     break;
     381             :   }
     382             : 
     383          84 :   case AMDGPU::BRANCH_COND_i32: {
     384             :     MachineInstr *NewMI =
     385         252 :         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
     386         168 :                 AMDGPU::PREDICATE_BIT)
     387         252 :             .add(MI.getOperand(1))
     388          84 :             .addImm(AMDGPU::PRED_SETNE_INT)
     389          84 :             .addImm(0); // Flags
     390          84 :     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
     391         420 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
     392         168 :         .add(MI.getOperand(0))
     393          84 :         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     394          84 :     break;
     395             :   }
     396             : 
     397          60 :   case AMDGPU::EG_ExportSwz:
     398             :   case AMDGPU::R600_ExportSwz: {
     399             :     // Instruction is left unmodified if its not the last one of its type
     400          60 :     bool isLastInstructionOfItsType = true;
     401          60 :     unsigned InstExportType = MI.getOperand(1).getImm();
     402         189 :     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
     403         249 :          EndBlock = BB->end(); NextExportInst != EndBlock;
     404             :          NextExportInst = std::next(NextExportInst)) {
     405         392 :       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
     406         116 :           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
     407          23 :         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
     408          23 :             .getImm();
     409          23 :         if (CurrentInstExportType == InstExportType) {
     410             :           isLastInstructionOfItsType = false;
     411             :           break;
     412             :         }
     413             :       }
     414             :     }
     415          60 :     bool EOP = isEOP(I);
     416          60 :     if (!EOP && !isLastInstructionOfItsType)
     417             :       return BB;
     418         102 :     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
     419         255 :     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
     420         102 :         .add(MI.getOperand(0))
     421         153 :         .add(MI.getOperand(1))
     422         153 :         .add(MI.getOperand(2))
     423         153 :         .add(MI.getOperand(3))
     424         153 :         .add(MI.getOperand(4))
     425         153 :         .add(MI.getOperand(5))
     426         153 :         .add(MI.getOperand(6))
     427         102 :         .addImm(CfInst)
     428         102 :         .addImm(EOP);
     429          51 :     break;
     430             :   }
     431             :   case AMDGPU::RETURN: {
     432             :     return BB;
     433             :   }
     434             :   }
     435             : 
     436        5770 :   MI.eraseFromParent();
     437        5770 :   return BB;
     438             : }
     439             : 
     440             : //===----------------------------------------------------------------------===//
     441             : // Custom DAG Lowering Operations
     442             : //===----------------------------------------------------------------------===//
     443             : 
     444      100053 : SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     445      100053 :   MachineFunction &MF = DAG.getMachineFunction();
     446      100053 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
     447      200106 :   switch (Op.getOpcode()) {
     448         392 :   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     449       10736 :   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
     450           7 :   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
     451          50 :   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
     452          28 :   case ISD::SRA_PARTS:
     453          28 :   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
     454          60 :   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
     455         620 :   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
     456          17 :   case ISD::FCOS:
     457          17 :   case ISD::FSIN: return LowerTrig(Op, DAG);
     458       16266 :   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
     459       32093 :   case ISD::STORE: return LowerSTORE(Op, DAG);
     460       37462 :   case ISD::LOAD: {
     461       37462 :     SDValue Result = LowerLOAD(Op, DAG);
     462             :     assert((!Result.getNode() ||
     463             :             Result.getNode()->getNumValues() == 2) &&
     464             :            "Load should return a value and a chain");
     465       37462 :     return Result;
     466             :   }
     467             : 
     468          84 :   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
     469          38 :   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
     470        1570 :   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
     471          82 :   case ISD::INTRINSIC_VOID: {
     472         164 :     SDValue Chain = Op.getOperand(0);
     473             :     unsigned IntrinsicID =
     474         328 :                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     475          82 :     switch (IntrinsicID) {
     476          60 :     case AMDGPUIntrinsic::r600_store_swizzle: {
     477         120 :       SDLoc DL(Op);
     478             :       const SDValue Args[8] = {
     479             :         Chain,
     480         120 :         Op.getOperand(2), // Export Value
     481         120 :         Op.getOperand(3), // ArrayBase
     482         120 :         Op.getOperand(4), // Type
     483         120 :         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
     484         120 :         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
     485         120 :         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
     486         120 :         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
     487         720 :       };
     488         180 :       return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
     489             :     }
     490             : 
     491             :     // default for switch(IntrinsicID)
     492             :     default: break;
     493             :     }
     494             :     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
     495          22 :     break;
     496             :   }
     497         548 :   case ISD::INTRINSIC_WO_CHAIN: {
     498             :     unsigned IntrinsicID =
     499        2192 :                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     500        1096 :     EVT VT = Op.getValueType();
     501         548 :     SDLoc DL(Op);
     502         548 :     switch (IntrinsicID) {
     503         276 :     case AMDGPUIntrinsic::r600_tex:
     504             :     case AMDGPUIntrinsic::r600_texc: {
     505             :       unsigned TextureOp;
     506         276 :       switch (IntrinsicID) {
     507             :       case AMDGPUIntrinsic::r600_tex:
     508             :         TextureOp = 0;
     509             :         break;
     510           7 :       case AMDGPUIntrinsic::r600_texc:
     511           7 :         TextureOp = 1;
     512           7 :         break;
     513           0 :       default:
     514           0 :         llvm_unreachable("unhandled texture operation");
     515             :       }
     516             : 
     517             :       SDValue TexArgs[19] = {
     518         552 :         DAG.getConstant(TextureOp, DL, MVT::i32),
     519         552 :         Op.getOperand(1),
     520         552 :         DAG.getConstant(0, DL, MVT::i32),
     521         552 :         DAG.getConstant(1, DL, MVT::i32),
     522         552 :         DAG.getConstant(2, DL, MVT::i32),
     523         552 :         DAG.getConstant(3, DL, MVT::i32),
     524         552 :         Op.getOperand(2),
     525         552 :         Op.getOperand(3),
     526         552 :         Op.getOperand(4),
     527         552 :         DAG.getConstant(0, DL, MVT::i32),
     528         552 :         DAG.getConstant(1, DL, MVT::i32),
     529         552 :         DAG.getConstant(2, DL, MVT::i32),
     530         552 :         DAG.getConstant(3, DL, MVT::i32),
     531         552 :         Op.getOperand(5),
     532         552 :         Op.getOperand(6),
     533         552 :         Op.getOperand(7),
     534         552 :         Op.getOperand(8),
     535         552 :         Op.getOperand(9),
     536         552 :         Op.getOperand(10)
     537        7176 :       };
     538         828 :       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
     539             :     }
     540          32 :     case AMDGPUIntrinsic::r600_dot4: {
     541             :       SDValue Args[8] = {
     542          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     543         128 :           DAG.getConstant(0, DL, MVT::i32)),
     544          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     545         128 :           DAG.getConstant(0, DL, MVT::i32)),
     546          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     547         128 :           DAG.getConstant(1, DL, MVT::i32)),
     548          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     549         128 :           DAG.getConstant(1, DL, MVT::i32)),
     550          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     551         128 :           DAG.getConstant(2, DL, MVT::i32)),
     552          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     553         128 :           DAG.getConstant(2, DL, MVT::i32)),
     554          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
     555         128 :           DAG.getConstant(3, DL, MVT::i32)),
     556          64 :       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
     557         128 :           DAG.getConstant(3, DL, MVT::i32))
     558         544 :       };
     559          96 :       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
     560             :     }
     561             : 
     562           2 :     case Intrinsic::r600_implicitarg_ptr: {
     563           6 :       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
     564           2 :       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
     565           2 :       return DAG.getConstant(ByteOffset, DL, PtrVT);
     566             :     }
     567           1 :     case Intrinsic::r600_read_ngroups_x:
     568           1 :       return LowerImplicitParameter(DAG, VT, DL, 0);
     569           1 :     case Intrinsic::r600_read_ngroups_y:
     570           1 :       return LowerImplicitParameter(DAG, VT, DL, 1);
     571           1 :     case Intrinsic::r600_read_ngroups_z:
     572           1 :       return LowerImplicitParameter(DAG, VT, DL, 2);
     573           2 :     case Intrinsic::r600_read_global_size_x:
     574           2 :       return LowerImplicitParameter(DAG, VT, DL, 3);
     575           2 :     case Intrinsic::r600_read_global_size_y:
     576           2 :       return LowerImplicitParameter(DAG, VT, DL, 4);
     577           2 :     case Intrinsic::r600_read_global_size_z:
     578           2 :       return LowerImplicitParameter(DAG, VT, DL, 5);
     579           8 :     case Intrinsic::r600_read_local_size_x:
     580           8 :       return LowerImplicitParameter(DAG, VT, DL, 6);
     581          20 :     case Intrinsic::r600_read_local_size_y:
     582          20 :       return LowerImplicitParameter(DAG, VT, DL, 7);
     583          20 :     case Intrinsic::r600_read_local_size_z:
     584          20 :       return LowerImplicitParameter(DAG, VT, DL, 8);
     585             : 
     586           4 :     case Intrinsic::r600_read_tgid_x:
     587             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     588           4 :                                      AMDGPU::T1_X, VT);
     589           3 :     case Intrinsic::r600_read_tgid_y:
     590             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     591           3 :                                      AMDGPU::T1_Y, VT);
     592           3 :     case Intrinsic::r600_read_tgid_z:
     593             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     594           3 :                                      AMDGPU::T1_Z, VT);
     595         127 :     case Intrinsic::r600_read_tidig_x:
     596             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     597         127 :                                      AMDGPU::T0_X, VT);
     598          16 :     case Intrinsic::r600_read_tidig_y:
     599             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     600          16 :                                      AMDGPU::T0_Y, VT);
     601          16 :     case Intrinsic::r600_read_tidig_z:
     602             :       return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
     603          16 :                                      AMDGPU::T0_Z, VT);
     604             : 
     605           3 :     case Intrinsic::r600_recipsqrt_ieee:
     606           6 :       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
     607             : 
     608           5 :     case Intrinsic::r600_recipsqrt_clamped:
     609          10 :       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
     610           4 :     default:
     611           4 :       return Op;
     612             :     }
     613             : 
     614             :     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
     615             :     break;
     616             :   }
     617             :   } // end switch(Op.getOpcode())
     618          22 :   return SDValue();
     619             : }
     620             : 
// Replace the illegal-typed results of \p N with legal equivalents, pushing
// the replacement values onto \p Results in result-number order.
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    // Anything not handled here falls back to the generic AMDGPU expansion.
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      // i1 results get a dedicated compare-based lowering.
      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
      return;
    }
    // Since we don't care about out of bounds values we can use FP_TO_SINT for
    // uints too. The DAGLegalizer code for uint considers some extra cases
    // which are not necessary here.
    LLVM_FALLTHROUGH;
  case ISD::FP_TO_SINT: {
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
      return;
    }

    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    // LowerSDIVREM produces a two-result node; push quotient then remainder.
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    // LowerUDIVREM64 appends its replacement values into Results directly.
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}
     662             : 
     663          16 : SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
     664             :                                                    SDValue Vector) const {
     665          32 :   SDLoc DL(Vector);
     666          32 :   EVT VecVT = Vector.getValueType();
     667          16 :   EVT EltVT = VecVT.getVectorElementType();
     668          32 :   SmallVector<SDValue, 8> Args;
     669             : 
     670          64 :   for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
     671          48 :     Args.push_back(DAG.getNode(
     672             :         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
     673         192 :         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
     674             :   }
     675             : 
     676          48 :   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
     677             : }
     678             : 
     679       10736 : SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     680             :                                                     SelectionDAG &DAG) const {
     681       21472 :   SDLoc DL(Op);
     682       21472 :   SDValue Vector = Op.getOperand(0);
     683       21472 :   SDValue Index = Op.getOperand(1);
     684             : 
     685          84 :   if (isa<ConstantSDNode>(Index) ||
     686          42 :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     687       10722 :     return Op;
     688             : 
     689          14 :   Vector = vectorToVerticalVector(DAG, Vector);
     690             :   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
     691          28 :                      Vector, Index);
     692             : }
     693             : 
     694           7 : SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     695             :                                                    SelectionDAG &DAG) const {
     696          14 :   SDLoc DL(Op);
     697          14 :   SDValue Vector = Op.getOperand(0);
     698          14 :   SDValue Value = Op.getOperand(1);
     699          14 :   SDValue Index = Op.getOperand(2);
     700             : 
     701           6 :   if (isa<ConstantSDNode>(Index) ||
     702           3 :       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
     703           6 :     return Op;
     704             : 
     705           1 :   Vector = vectorToVerticalVector(DAG, Vector);
     706             :   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
     707           2 :                                Vector, Value, Index);
     708           1 :   return vectorToVerticalVector(DAG, Insert);
     709             : }
     710             : 
     711          38 : SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
     712             :                                                SDValue Op,
     713             :                                                SelectionDAG &DAG) const {
     714          38 :   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
     715          38 :   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
     716          23 :     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
     717             : 
     718          30 :   const DataLayout &DL = DAG.getDataLayout();
     719          15 :   const GlobalValue *GV = GSD->getGlobal();
     720          30 :   MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
     721             : 
     722          60 :   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
     723          60 :   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
     724             : }
     725             : 
// Lower FSIN/FCOS to the hardware trig nodes, pre-scaling the radian argument
// into the input range the hardware expects.
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Should this propagate fast-math-flags?
  // 0.15915494309 == 1 / (2 * pi): convert radians to revolutions, bias by
  // 0.5 and take FRACT so the argument wraps into [0, 1).
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT,
        DAG.getNode(ISD::FMUL, DL, VT, Arg,
          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
        DAG.getConstantFP(0.5, DL, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  // Undo the +0.5 bias, re-centering the wrapped value into [-0.5, 0.5).
  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT, FractPart,
        DAG.getConstantFP(-0.5, DL, MVT::f32)));
  if (Gen >= R600Subtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  // 3.14159265359 == pi: rescale the normalized value accordingly.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
}
     759             : 
     760          50 : SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
     761         100 :   SDLoc DL(Op);
     762         100 :   EVT VT = Op.getValueType();
     763             : 
     764         100 :   SDValue Lo = Op.getOperand(0);
     765         100 :   SDValue Hi = Op.getOperand(1);
     766         100 :   SDValue Shift = Op.getOperand(2);
     767          50 :   SDValue Zero = DAG.getConstant(0, DL, VT);
     768          50 :   SDValue One  = DAG.getConstant(1, DL, VT);
     769             : 
     770          50 :   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
     771          50 :   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
     772          50 :   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
     773          50 :   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
     774             : 
     775             :   // The dance around Width1 is necessary for 0 special case.
     776             :   // Without it the CompShift might be 32, producing incorrect results in
     777             :   // Overflow. So we do the shift in two steps, the alternative is to
     778             :   // add a conditional to filter the special case.
     779             : 
     780          50 :   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
     781          50 :   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
     782             : 
     783          50 :   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
     784          50 :   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
     785          50 :   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
     786             : 
     787          50 :   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
     788          50 :   SDValue LoBig = Zero;
     789             : 
     790          50 :   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
     791          50 :   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
     792             : 
     793         100 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
     794             : }
     795             : 
// Lower SRL_PARTS/SRA_PARTS (64-bit right shift split across two 32-bit
// halves). The two opcodes share this path; SRA selects the arithmetic
// variants and sign-fills the big-shift high half.
SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One  = DAG.getConstant(1, DL, VT);

  // True for SRA_PARTS (arithmetic), false for SRL_PARTS (logical).
  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  // Leftover amount once the shift consumes the whole low half.
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  // Bits carried from the high half down into the low half (small shifts).
  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  // Results for Shift < Width.
  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  // Results for Shift >= Width: the low half takes the shifted high half, and
  // the high half is either the replicated sign bit (SRA) or zero (SRL).
  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}
     833             : 
     834         680 : SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
     835             :                                           unsigned mainop, unsigned ovf) const {
     836        1360 :   SDLoc DL(Op);
     837        1360 :   EVT VT = Op.getValueType();
     838             : 
     839        1360 :   SDValue Lo = Op.getOperand(0);
     840        1360 :   SDValue Hi = Op.getOperand(1);
     841             : 
     842         680 :   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
     843             :   // Extend sign.
     844         680 :   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
     845        2040 :                     DAG.getValueType(MVT::i1));
     846             : 
     847         680 :   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
     848             : 
     849        1360 :   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
     850             : }
     851             : 
     852           2 : SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
     853           4 :   SDLoc DL(Op);
     854             :   return DAG.getNode(
     855             :       ISD::SETCC,
     856             :       DL,
     857             :       MVT::i1,
     858           2 :       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
     859           8 :       DAG.getCondCode(ISD::SETEQ));
     860             : }
     861             : 
     862           2 : SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
     863           4 :   SDLoc DL(Op);
     864             :   return DAG.getNode(
     865             :       ISD::SETCC,
     866             :       DL,
     867             :       MVT::i1,
     868           2 :       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
     869           8 :       DAG.getCondCode(ISD::SETEQ));
     870             : }
     871             : 
     872          57 : SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
     873             :                                                    const SDLoc &DL,
     874             :                                                    unsigned DwordOffset) const {
     875          57 :   unsigned ByteOffset = DwordOffset * 4;
     876          57 :   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
     877          57 :                                       AMDGPUASI.CONSTANT_BUFFER_0);
     878             : 
     879             :   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
     880             :   assert(isInt<16>(ByteOffset));
     881             : 
     882             :   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
     883          57 :                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
     884         228 :                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
     885             : }
     886             : 
     887       21096 : bool R600TargetLowering::isZero(SDValue Op) const {
     888        9690 :   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     889             :     return Cst->isNullValue();
     890         220 :   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
     891         220 :     return CstFP->isZero();
     892             :   } else {
     893             :     return false;
     894             :   }
     895             : }
     896             : 
     897       32502 : bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
     898         462 :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     899         462 :     return CFP->isExactlyValue(1.0);
     900             :   }
     901       32040 :   return isAllOnesConstant(Op);
     902             : }
     903             : 
     904        5819 : bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
     905         155 :   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     906         310 :     return CFP->getValueAPF().isZero();
     907             :   }
     908        5664 :   return isNullConstant(Op);
     909             : }
     910             : 
// Lower SELECT_CC into forms the R600 patterns can match (SET* when the
// select arms are hardware true/false values, CND* when the RHS is zero),
// falling back to a pair of SELECT_CC nodes otherwise.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // f32 selects may really be legacy min/max patterns; try that first.
  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  // If the arms are reversed (false value in the True slot), flip them by
  // inverting the condition — or by inverting and swapping the compare
  // operands if only the swapped form is legal.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // NE-style predicates are not emitted directly: invert the condition and
    // swap the select arms instead.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
    1053             : /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
    1054             : /// convert these pointers to a register index.  Each register holds
    1055             : /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
    1057             : /// for indirect addressing.
    1058           0 : SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
    1059             :                                                unsigned StackWidth,
    1060             :                                                SelectionDAG &DAG) const {
    1061             :   unsigned SRLPad;
    1062           0 :   switch(StackWidth) {
    1063             :   case 1:
    1064             :     SRLPad = 2;
    1065             :     break;
    1066           0 :   case 2:
    1067           0 :     SRLPad = 3;
    1068           0 :     break;
    1069           0 :   case 4:
    1070           0 :     SRLPad = 4;
    1071           0 :     break;
    1072           0 :   default: llvm_unreachable("Invalid stack width");
    1073             :   }
    1074             : 
    1075           0 :   SDLoc DL(Ptr);
    1076             :   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
    1077           0 :                      DAG.getConstant(SRLPad, DL, MVT::i32));
    1078             : }
    1079             : 
    1080           0 : void R600TargetLowering::getStackAddress(unsigned StackWidth,
    1081             :                                          unsigned ElemIdx,
    1082             :                                          unsigned &Channel,
    1083             :                                          unsigned &PtrIncr) const {
    1084           0 :   switch (StackWidth) {
    1085           0 :   default:
    1086             :   case 1:
    1087           0 :     Channel = 0;
    1088           0 :     if (ElemIdx > 0) {
    1089           0 :       PtrIncr = 1;
    1090             :     } else {
    1091           0 :       PtrIncr = 0;
    1092             :     }
    1093             :     break;
    1094           0 :   case 2:
    1095           0 :     Channel = ElemIdx % 2;
    1096           0 :     if (ElemIdx == 2) {
    1097           0 :       PtrIncr = 1;
    1098             :     } else {
    1099           0 :       PtrIncr = 0;
    1100             :     }
    1101             :     break;
    1102           0 :   case 4:
    1103           0 :     Channel = ElemIdx;
    1104           0 :     PtrIncr = 0;
    1105           0 :     break;
    1106             :   }
    1107           0 : }
    1108             : 
/// Lower a sub-dword store to PRIVATE_ADDRESS as a read-modify-write of the
/// containing 32-bit word: load the dword, mask out the target byte/halfword,
/// OR in the shifted value, and store the dword back.  Also handles sub-i32
/// non-truncating stores (e.g. i1) — see the note below.
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);
  //TODO: Who creates the i8 stores?
  assert(Store->isTruncatingStore()
         || Store->getValue().getValueType() == MVT::i8);
  assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  // Mask selecting the bits of the dword actually being written.
  SDValue Mask;
  if (Store->getMemoryVT() == MVT::i8) {
    assert(Store->getAlignment() >= 1);
    Mask = DAG.getConstant(0xff, DL, MVT::i32);
  } else if (Store->getMemoryVT() == MVT::i16) {
    assert(Store->getAlignment() >= 2);
    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
  } else {
    llvm_unreachable("Unsupported private trunc store");
  }

  SDValue OldChain = Store->getChain();
  // A DUMMY_CHAIN input marks this store as one element of an expanded
  // vector store (see LowerSTORE, which inserts the dummy).
  bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
  // Skip dummy
  SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
  SDValue BasePtr = Store->getBasePtr();
  SDValue Offset = Store->getOffset();
  EVT MemVT = Store->getMemoryVT();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // TODO: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());

  // Chain through the load so the final store depends on it.
  Chain = Dst.getValue(1);

  // Get offset in dword
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Convert byte offset to bit shift
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // TODO: Contrary to the name of the function,
  // it also handles sub i32 non-truncating stores (like i1)
  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  // Mask the value to the right type
  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  // Shift the value in place
  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  // Shift the mask in place
  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);

  // Invert the mask. NOTE: if we had native ROL instructions we could
  // use inverted mask
  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);

  // Cleanup the target bits
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  // Add the new bits
  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

  // Store dword
  // TODO: Can we be smarter about MachinePointerInfo?
  SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());

  // If we are part of expanded vector, make our neighbors depend on this store
  if (VectorTrunc) {
    // Make all other vector elements depend on this store
    Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
    DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
  }
  return NewStore;
}
    1197             : 
/// Custom lowering for ISD::STORE.  Dispatches on address space and memory
/// type: scalarizes vector stores to LOCAL/PRIVATE, expands unaligned stores,
/// emits STORE_MSKOR for global truncating stores, tags dword-sized stores
/// with DWORDADDR, and hands sub-dword private stores to
/// lowerPrivateTruncStore.  Returns SDValue() when the default/pattern
/// handling should take over.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();

  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();
  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();
  EVT PtrVT = Ptr.getValueType();

  SDLoc DL(Op);

  // Neither LOCAL nor PRIVATE can do vectors at the moment
  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
      VT.isVector()) {
    if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
         StoreNode->isTruncatingStore()) {
      // Add an extra level of chain to isolate this vector
      SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
      // TODO: can the chain be replaced without creating a new store?
      SDValue NewStore = DAG.getTruncStore(
          NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
          MemVT, StoreNode->getAlignment(),
          StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
      StoreNode = cast<StoreSDNode>(NewStore);
    }

    return scalarizeVectorStore(StoreNode, DAG);
  }

  // Expand stores the hardware cannot do at this alignment.
  unsigned Align = StoreNode->getAlignment();
  if (Align < MemVT.getStoreSize() &&
      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
    return expandUnalignedStore(StoreNode, DAG);
  }

  // Byte address shifted down to a dword index; used by both the MSKOR and
  // DWORDADDR paths below.
  SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                  DAG.getConstant(2, DL, PtrVT));

  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
    // It is beneficial to create MSKOR here instead of combiner to avoid
    // artificial dependencies introduced by RMW
    if (StoreNode->isTruncatingStore()) {
      assert(VT.bitsLE(MVT::i32));
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        assert(StoreNode->getAlignment() >= 2);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }

      // Bit position of the target byte/halfword within its dword.
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
                                      DAG.getConstant(0x00000003, DL, PtrVT));
      SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                     DAG.getConstant(3, DL, VT));

      // Put the mask in correct place
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);

      // Put the value bits in correct place
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);

      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
  if (AS != AMDGPUASI.PRIVATE_ADDRESS)
    return SDValue();

  // Sub-dword private stores need a read-modify-write sequence.
  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Standard i32+ store, tag it with DWORDADDR to note that the address
  // has been shifted
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
  }

  // Tagged i32+ stores will be matched by patterns
  return SDValue();
}
    1308             : 
    1309             : // return (512 + (kc_bank << 12)
    1310             : static int
    1311             : ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) {
    1312             :   switch (AddressSpace) {
    1313             :   case AMDGPUASI.CONSTANT_BUFFER_0:
    1314             :     return 512;
    1315             :   case AMDGPUASI.CONSTANT_BUFFER_1:
    1316             :     return 512 + 4096;
    1317             :   case AMDGPUASI.CONSTANT_BUFFER_2:
    1318             :     return 512 + 4096 * 2;
    1319             :   case AMDGPUASI.CONSTANT_BUFFER_3:
    1320             :     return 512 + 4096 * 3;
    1321             :   case AMDGPUASI.CONSTANT_BUFFER_4:
    1322             :     return 512 + 4096 * 4;
    1323             :   case AMDGPUASI.CONSTANT_BUFFER_5:
    1324             :     return 512 + 4096 * 5;
    1325             :   case AMDGPUASI.CONSTANT_BUFFER_6:
    1326             :     return 512 + 4096 * 6;
    1327             :   case AMDGPUASI.CONSTANT_BUFFER_7:
    1328             :     return 512 + 4096 * 7;
    1329             :   case AMDGPUASI.CONSTANT_BUFFER_8:
    1330             :     return 512 + 4096 * 8;
    1331             :   case AMDGPUASI.CONSTANT_BUFFER_9:
    1332             :     return 512 + 4096 * 9;
    1333             :   case AMDGPUASI.CONSTANT_BUFFER_10:
    1334             :     return 512 + 4096 * 10;
    1335             :   case AMDGPUASI.CONSTANT_BUFFER_11:
    1336             :     return 512 + 4096 * 11;
    1337             :   case AMDGPUASI.CONSTANT_BUFFER_12:
    1338             :     return 512 + 4096 * 12;
    1339             :   case AMDGPUASI.CONSTANT_BUFFER_13:
    1340             :     return 512 + 4096 * 13;
    1341             :   case AMDGPUASI.CONSTANT_BUFFER_14:
    1342             :     return 512 + 4096 * 14;
    1343             :   case AMDGPUASI.CONSTANT_BUFFER_15:
    1344             :     return 512 + 4096 * 15;
    1345             :   default:
    1346             :     return -1;
    1347             :   }
    1348             : }
    1349             : 
/// Lower a sub-dword extending load from PRIVATE_ADDRESS: load the containing
/// 32-bit word, shift the target byte/halfword down, and sign- or zero-extend
/// it according to the load's extension type.  Returns the merged
/// {value, chain} pair.
SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  assert(Load->getAlignment() >= MemVT.getStoreSize());

  SDValue BasePtr = Load->getBasePtr();
  SDValue Chain = Load->getChain();
  SDValue Offset = Load->getOffset();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // NOTE: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  if (ExtType == ISD::SEXTLOAD) { // ... ones.
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  } else { // ... or zeros.
    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
  }

  SDValue Ops[] = {
    Ret,
    Read.getValue(1) // This should be our output chain
  };

  return DAG.getMergeValues(Ops, DL);
}
    1404             : 
/// Custom lowering for ISD::LOAD.  Routes sub-dword private extloads to
/// lowerPrivateExtLoad, scalarizes vector loads from LOCAL/PRIVATE, folds
/// loads from constant buffers into CONST_ADDRESS nodes, manually expands
/// SEXT loads, and tags dword private loads with DWORDADDR.  Returns
/// SDValue() when default handling applies.
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  unsigned AS = LoadNode->getAddressSpace();
  EVT MemVT = LoadNode->getMemoryVT();
  ISD::LoadExtType ExtType = LoadNode->getExtensionType();

  if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    return lowerPrivateExtLoad(Op, DAG);
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();

  // Neither LOCAL nor PRIVATE supports vector loads; split them up.
  if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
      LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
      VT.isVector()) {
      return scalarizeVectorLoad(LoadNode, DAG);
  }

  // Constant-buffer loads (non- or zero-extending) become CONST_ADDRESS.
  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(),
      AMDGPUASI);
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
    } else {
      // non-constant ptr can't be folded, keeps it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, DL, MVT::i32)),
                      DAG.getConstant(LoadNode->getAddressSpace() -
                                      AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
          );
    }

    // Scalar loads only need element 0 of the v4i32 result.
    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, DL, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue NewLoad = DAG.getExtLoad(
        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
                              DAG.getValueType(MemVT));

    SDValue MergedValues[2] = { Res, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
    return SDValue();
  }

  // DWORDADDR ISD marks already shifted address
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    assert(VT == MVT::i32);
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
    return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
  }
  return SDValue();
}
    1509             : 
    1510          84 : SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    1511         168 :   SDValue Chain = Op.getOperand(0);
    1512         168 :   SDValue Cond  = Op.getOperand(1);
    1513         168 :   SDValue Jump  = Op.getOperand(2);
    1514             : 
    1515         168 :   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
    1516         336 :                      Chain, Jump, Cond);
    1517             : }
    1518             : 
    1519        1570 : SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
    1520             :                                             SelectionDAG &DAG) const {
    1521        1570 :   MachineFunction &MF = DAG.getMachineFunction();
    1522        3140 :   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
    1523             : 
    1524        1570 :   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
    1525             : 
    1526        1570 :   unsigned FrameIndex = FIN->getIndex();
    1527             :   unsigned IgnoredFrameReg;
    1528             :   unsigned Offset =
    1529        1570 :     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
    1530        3140 :   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
    1531        6280 :                          Op.getValueType());
    1532             : }
    1533             : 
    1534             : /// XXX Only kernel functions are supported, so we can assume for now that
    1535             : /// every function is a kernel function, but in the future we should use
    1536             : /// separate calling conventions for kernel and non-kernel functions.
    1537        2058 : SDValue R600TargetLowering::LowerFormalArguments(
    1538             :     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    1539             :     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    1540             :     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    1541        4116 :   SmallVector<CCValAssign, 16> ArgLocs;
    1542             :   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
    1543        4116 :                  *DAG.getContext());
    1544        2058 :   MachineFunction &MF = DAG.getMachineFunction();
    1545        2058 :   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    1546             : 
    1547        4116 :   SmallVector<ISD::InputArg, 8> LocalIns;
    1548             : 
    1549        2058 :   if (AMDGPU::isShader(CallConv)) {
    1550          47 :     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
    1551             :   } else {
    1552        2011 :     analyzeFormalArgumentsCompute(CCInfo, Ins);
    1553             :   }
    1554             : 
    1555        9328 :   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    1556       10424 :     CCValAssign &VA = ArgLocs[i];
    1557       10424 :     const ISD::InputArg &In = Ins[i];
    1558        5212 :     EVT VT = In.VT;
    1559       10424 :     EVT MemVT = VA.getLocVT();
    1560       10073 :     if (!VT.isVector() && MemVT.isVector()) {
    1561             :       // Get load source type if scalarized.
    1562           0 :       MemVT = MemVT.getVectorElementType();
    1563             :     }
    1564             : 
    1565        5212 :     if (AMDGPU::isShader(CallConv)) {
    1566          63 :       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
    1567          63 :       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
    1568          63 :       InVals.push_back(Register);
    1569          63 :       continue;
    1570             :     }
    1571             : 
    1572        5149 :     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
    1573        5149 :                                           AMDGPUASI.CONSTANT_BUFFER_0);
    1574             : 
    1575             :     // i64 isn't a legal type, so the register type used ends up as i32, which
    1576             :     // isn't expected here. It attempts to create this sextload, but it ends up
    1577             :     // being invalid. Somehow this seems to work with i64 arguments, but breaks
    1578             :     // for <1 x i64>.
    1579             : 
    1580             :     // The first 36 bytes of the input buffer contains information about
    1581             :     // thread group and global sizes.
    1582        5149 :     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    1583       10298 :     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
    1584             :       // FIXME: This should really check the extload type, but the handling of
    1585             :       // extload vector parameters seems to be broken.
    1586             : 
    1587             :       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
    1588         182 :       Ext = ISD::SEXTLOAD;
    1589             :     }
    1590             : 
    1591             :     // Compute the offset from the value.
    1592             :     // XXX - I think PartOffset should give you this, but it seems to give the
    1593             :     // size of the register which isn't useful.
    1594             : 
    1595       10298 :     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    1596        5149 :     unsigned PartOffset = VA.getLocMemOffset();
    1597       10298 :     unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
    1598             : 
    1599       10298 :     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    1600             :     SDValue Arg = DAG.getLoad(
    1601             :         ISD::UNINDEXED, Ext, VT, DL, Chain,
    1602       10298 :         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
    1603        5149 :         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
    1604        5149 :                                         MachineMemOperand::MODereferenceable |
    1605       20596 :                                         MachineMemOperand::MOInvariant);
    1606             : 
    1607             :     // 4 is the preferred alignment for the CONSTANT memory space.
    1608        5149 :     InVals.push_back(Arg);
    1609       15447 :     MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
    1610             :   }
    1611        4116 :   return Chain;
    1612             : }
    1613             : 
    1614       34642 : EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    1615             :                                            EVT VT) const {
    1616       34642 :    if (!VT.isVector())
    1617       34609 :      return MVT::i32;
    1618          33 :    return VT.changeVectorElementTypeToInteger();
    1619             : }
    1620             : 
    1621          87 : bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
    1622             :                                           const SelectionDAG &DAG) const {
    1623             :   // Local and Private addresses do not handle vectors. Limit to i32
    1624          87 :   if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
    1625          86 :     return (MemVT.getSizeInBits() <= 32);
    1626             :   }
    1627             :   return true;
    1628             : }
    1629             : 
    1630         730 : bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    1631             :                                                         unsigned AddrSpace,
    1632             :                                                         unsigned Align,
    1633             :                                                         bool *IsFast) const {
    1634         730 :   if (IsFast)
    1635         513 :     *IsFast = false;
    1636             : 
    1637        2190 :   if (!VT.isSimple() || VT == MVT::Other)
    1638             :     return false;
    1639             : 
    1640         730 :   if (VT.bitsLT(MVT::i32))
    1641             :     return false;
    1642             : 
    1643             :   // TODO: This is a rough estimate.
    1644         699 :   if (IsFast)
    1645         506 :     *IsFast = true;
    1646             : 
    1647         699 :   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
    1648             : }
    1649             : 
    1650         392 : static SDValue CompactSwizzlableVector(
    1651             :   SelectionDAG &DAG, SDValue VectorEntry,
    1652             :   DenseMap<unsigned, unsigned> &RemapSwizzle) {
    1653             :   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
    1654             :   assert(RemapSwizzle.empty());
    1655             :   SDValue NewBldVec[4] = {
    1656         784 :     VectorEntry.getOperand(0),
    1657         784 :     VectorEntry.getOperand(1),
    1658         784 :     VectorEntry.getOperand(2),
    1659         784 :     VectorEntry.getOperand(3)
    1660        1568 :   };
    1661             : 
    1662        1960 :   for (unsigned i = 0; i < 4; i++) {
    1663        3136 :     if (NewBldVec[i].isUndef())
    1664             :       // We mask write here to teach later passes that the ith element of this
    1665             :       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
    1666             :       // break false dependencies and additionnaly make assembly easier to read.
    1667         212 :       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    1668        1596 :     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
    1669          28 :       if (C->isZero()) {
    1670          27 :         RemapSwizzle[i] = 4; // SEL_0
    1671          27 :         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
    1672           1 :       } else if (C->isExactlyValue(1.0)) {
    1673           1 :         RemapSwizzle[i] = 5; // SEL_1
    1674           1 :         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
    1675             :       }
    1676             :     }
    1677             : 
    1678        3136 :     if (NewBldVec[i].isUndef())
    1679             :       continue;
    1680        4974 :     for (unsigned j = 0; j < i; j++) {
    1681        3678 :       if (NewBldVec[i] == NewBldVec[j]) {
    1682          32 :         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
    1683          16 :         RemapSwizzle[i] = j;
    1684             :         break;
    1685             :       }
    1686             :     }
    1687             :   }
    1688             : 
    1689         784 :   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
    1690        1568 :                             NewBldVec);
    1691             : }
    1692             : 
    1693         392 : static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
    1694             :                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
    1695             :   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
    1696             :   assert(RemapSwizzle.empty());
    1697             :   SDValue NewBldVec[4] = {
    1698         784 :       VectorEntry.getOperand(0),
    1699         784 :       VectorEntry.getOperand(1),
    1700         784 :       VectorEntry.getOperand(2),
    1701         784 :       VectorEntry.getOperand(3)
    1702        1568 :   };
    1703         392 :   bool isUnmovable[4] = { false, false, false, false };
    1704        1960 :   for (unsigned i = 0; i < 4; i++) {
    1705        1568 :     RemapSwizzle[i] = i;
    1706        3136 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1707             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1708         188 :           ->getZExtValue();
    1709          47 :       if (i == Idx)
    1710          39 :         isUnmovable[Idx] = true;
    1711             :     }
    1712             :   }
    1713             : 
    1714        1949 :   for (unsigned i = 0; i < 4; i++) {
    1715        3126 :     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1716             :       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1717         184 :           ->getZExtValue();
    1718          46 :       if (isUnmovable[Idx])
    1719          40 :         continue;
    1720             :       // Swap i and Idx
    1721          12 :       std::swap(NewBldVec[Idx], NewBldVec[i]);
    1722          18 :       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
    1723           6 :       break;
    1724             :     }
    1725             :   }
    1726             : 
    1727         784 :   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
    1728        1568 :                             NewBldVec);
    1729             : }
    1730             : 
    1731         392 : SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
    1732             :                                             SelectionDAG &DAG,
    1733             :                                             const SDLoc &DL) const {
    1734             :   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
    1735             :   // Old -> New swizzle values
    1736         784 :   DenseMap<unsigned, unsigned> SwizzleRemap;
    1737             : 
    1738         392 :   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
    1739        1960 :   for (unsigned i = 0; i < 4; i++) {
    1740        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1741        3136 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1742         238 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1743             :   }
    1744             : 
    1745         392 :   SwizzleRemap.clear();
    1746         392 :   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
    1747        1960 :   for (unsigned i = 0; i < 4; i++) {
    1748        4704 :     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    1749        3136 :     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
    1750        2700 :       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
    1751             :   }
    1752             : 
    1753         784 :   return BuildVector;
    1754             : }
    1755             : 
    1756             : //===----------------------------------------------------------------------===//
    1757             : // Custom DAG Optimizations
    1758             : //===----------------------------------------------------------------------===//
    1759             : 
    1760      193800 : SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
    1761             :                                               DAGCombinerInfo &DCI) const {
    1762      193800 :   SelectionDAG &DAG = DCI.DAG;
    1763      387600 :   SDLoc DL(N);
    1764             : 
    1765      387600 :   switch (N->getOpcode()) {
    1766             :   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
    1767           2 :   case ISD::FP_ROUND: {
    1768           4 :       SDValue Arg = N->getOperand(0);
    1769           6 :       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
    1770             :         return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
    1771           3 :                            Arg.getOperand(0));
    1772             :       }
    1773           1 :       break;
    1774             :     }
    1775             : 
    1776             :   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
    1777             :   // (i32 select_cc f32, f32, -1, 0 cc)
    1778             :   //
    1779             :   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
    1780             :   // this to one of the SET*_DX10 instructions.
    1781          58 :   case ISD::FP_TO_SINT: {
    1782         116 :     SDValue FNeg = N->getOperand(0);
    1783         116 :     if (FNeg.getOpcode() != ISD::FNEG) {
    1784          42 :       return SDValue();
    1785             :     }
    1786          32 :     SDValue SelectCC = FNeg.getOperand(0);
    1787          16 :     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
    1788          24 :         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
    1789          32 :         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
    1790          40 :         !isHWTrueValue(SelectCC.getOperand(2)) ||
    1791          16 :         !isHWFalseValue(SelectCC.getOperand(3))) {
    1792           8 :       return SDValue();
    1793             :     }
    1794             : 
    1795             :     return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
    1796          16 :                            SelectCC.getOperand(0), // LHS
    1797          16 :                            SelectCC.getOperand(1), // RHS
    1798           8 :                            DAG.getConstant(-1, DL, MVT::i32), // True
    1799           8 :                            DAG.getConstant(0, DL, MVT::i32),  // False
    1800          48 :                            SelectCC.getOperand(4)); // CC
    1801             : 
    1802             :     break;
    1803             :   }
    1804             : 
    1805             :   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
    1806             :   // => build_vector elt0, ... , NewEltIdx, ... , eltN
    1807         262 :   case ISD::INSERT_VECTOR_ELT: {
    1808         524 :     SDValue InVec = N->getOperand(0);
    1809         524 :     SDValue InVal = N->getOperand(1);
    1810         524 :     SDValue EltNo = N->getOperand(2);
    1811             : 
    1812             :     // If the inserted element is an UNDEF, just use the input vector.
    1813         524 :     if (InVal.isUndef())
    1814           0 :       return InVec;
    1815             : 
    1816         524 :     EVT VT = InVec.getValueType();
    1817             : 
    1818             :     // If we can't generate a legal BUILD_VECTOR, exit
    1819         524 :     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
    1820           0 :       return SDValue();
    1821             : 
    1822             :     // Check that we know which element is being inserted
    1823           2 :     if (!isa<ConstantSDNode>(EltNo))
    1824           2 :       return SDValue();
    1825         520 :     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
    1826             : 
    1827             :     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    1828             :     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
    1829             :     // vector elements.
    1830         260 :     SmallVector<SDValue, 8> Ops;
    1831         520 :     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
    1832           8 :       Ops.append(InVec.getNode()->op_begin(),
    1833             :                  InVec.getNode()->op_end());
    1834         512 :     } else if (InVec.isUndef()) {
    1835           0 :       unsigned NElts = VT.getVectorNumElements();
    1836           0 :       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    1837             :     } else {
    1838         256 :       return SDValue();
    1839             :     }
    1840             : 
    1841             :     // Insert the element
    1842           8 :     if (Elt < Ops.size()) {
    1843             :       // All the operands of BUILD_VECTOR must have the same type;
    1844             :       // we enforce that here.
    1845           8 :       EVT OpVT = Ops[0].getValueType();
    1846           8 :       if (InVal.getValueType() != OpVT)
    1847           0 :         InVal = OpVT.bitsGT(InVal.getValueType()) ?
    1848           0 :           DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
    1849           0 :           DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
    1850           8 :       Ops[Elt] = InVal;
    1851             :     }
    1852             : 
    1853             :     // Return the new vector
    1854           4 :     return DAG.getBuildVector(VT, DL, Ops);
    1855             :   }
    1856             : 
    1857             :   // Extract_vec (Build_vector) generated by custom lowering
    1858             :   // also needs to be customly combined
    1859       11344 :   case ISD::EXTRACT_VECTOR_ELT: {
    1860       22688 :     SDValue Arg = N->getOperand(0);
    1861       22688 :     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
    1862          50 :       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    1863           0 :         unsigned Element = Const->getZExtValue();
    1864          23 :         return Arg->getOperand(Element);
    1865             :       }
    1866             :     }
    1867       11540 :     if (Arg.getOpcode() == ISD::BITCAST &&
    1868       11564 :         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
    1869       11416 :         (Arg.getOperand(0).getValueType().getVectorNumElements() ==
    1870       11392 :          Arg.getValueType().getVectorNumElements())) {
    1871          69 :       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    1872          23 :         unsigned Element = Const->getZExtValue();
    1873             :         return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
    1874          92 :                            Arg->getOperand(0).getOperand(Element));
    1875             :       }
    1876             :     }
    1877       11321 :     break;
    1878             :   }
    1879             : 
    1880       11280 :   case ISD::SELECT_CC: {
    1881             :     // Try common optimizations
    1882       11280 :     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
    1883           0 :       return Ret;
    1884             : 
    1885             :     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    1886             :     //      selectcc x, y, a, b, inv(cc)
    1887             :     //
    1888             :     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    1889             :     //      selectcc x, y, a, b, cc
    1890       22560 :     SDValue LHS = N->getOperand(0);
    1891       22560 :     if (LHS.getOpcode() != ISD::SELECT_CC) {
    1892        5567 :       return SDValue();
    1893             :     }
    1894             : 
    1895       11426 :     SDValue RHS = N->getOperand(1);
    1896       11426 :     SDValue True = N->getOperand(2);
    1897       11426 :     SDValue False = N->getOperand(3);
    1898       17139 :     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
    1899             : 
    1900       11439 :     if (LHS.getOperand(2).getNode() != True.getNode() ||
    1901        5738 :         LHS.getOperand(3).getNode() != False.getNode() ||
    1902          12 :         RHS.getNode() != False.getNode()) {
    1903        5709 :       return SDValue();
    1904             :     }
    1905             : 
    1906           4 :     switch (NCC) {
    1907           0 :     default: return SDValue();
    1908           1 :     case ISD::SETNE: return LHS;
    1909           3 :     case ISD::SETEQ: {
    1910           9 :       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
    1911           3 :       LHSCC = ISD::getSetCCInverse(LHSCC,
    1912          12 :                                   LHS.getOperand(0).getValueType().isInteger());
    1913           6 :       if (DCI.isBeforeLegalizeOps() ||
    1914          12 :           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
    1915             :         return DAG.getSelectCC(DL,
    1916           0 :                                LHS.getOperand(0),
    1917           0 :                                LHS.getOperand(1),
    1918           0 :                                LHS.getOperand(2),
    1919           0 :                                LHS.getOperand(3),
    1920           0 :                                LHSCC);
    1921             :       break;
    1922             :     }
    1923             :     }
    1924           3 :     return SDValue();
    1925             :   }
    1926             : 
    1927         138 :   case AMDGPUISD::R600_EXPORT: {
    1928         276 :     SDValue Arg = N->getOperand(1);
    1929         276 :     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
    1930             :       break;
    1931             : 
    1932             :     SDValue NewArgs[8] = {
    1933         240 :       N->getOperand(0), // Chain
    1934             :       SDValue(),
    1935         240 :       N->getOperand(2), // ArrayBase
    1936         240 :       N->getOperand(3), // Type
    1937         240 :       N->getOperand(4), // SWZ_X
    1938         240 :       N->getOperand(5), // SWZ_Y
    1939         240 :       N->getOperand(6), // SWZ_Z
    1940         240 :       N->getOperand(7) // SWZ_W
    1941         840 :     };
    1942         240 :     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    1943         240 :     return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
    1944             :   }
    1945         296 :   case AMDGPUISD::TEXTURE_FETCH: {
    1946         592 :     SDValue Arg = N->getOperand(1);
    1947         592 :     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
    1948             :       break;
    1949             : 
    1950             :     SDValue NewArgs[19] = {
    1951         544 :       N->getOperand(0),
    1952         544 :       N->getOperand(1),
    1953         544 :       N->getOperand(2),
    1954         544 :       N->getOperand(3),
    1955         544 :       N->getOperand(4),
    1956         544 :       N->getOperand(5),
    1957         544 :       N->getOperand(6),
    1958         544 :       N->getOperand(7),
    1959         544 :       N->getOperand(8),
    1960         544 :       N->getOperand(9),
    1961         544 :       N->getOperand(10),
    1962         544 :       N->getOperand(11),
    1963         544 :       N->getOperand(12),
    1964         544 :       N->getOperand(13),
    1965         544 :       N->getOperand(14),
    1966         544 :       N->getOperand(15),
    1967         544 :       N->getOperand(16),
    1968         544 :       N->getOperand(17),
    1969         544 :       N->getOperand(18),
    1970        5168 :     };
    1971         544 :     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    1972         544 :     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
    1973             :   }
    1974             :   default: break;
    1975             :   }
    1976             : 
    1977      181784 :   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    1978             : }
    1979             : 
    1980      235642 : bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
    1981             :                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
    1982             :                                      SDValue &Sel, SDValue &Imm,
    1983             :                                      SelectionDAG &DAG) const {
    1984      471284 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    1985      471284 :   if (!Src.isMachineOpcode())
    1986             :     return false;
    1987             : 
    1988      302300 :   switch (Src.getMachineOpcode()) {
    1989         117 :   case AMDGPU::FNEG_R600:
    1990         117 :     if (!Neg.getNode())
    1991             :       return false;
    1992         186 :     Src = Src.getOperand(0);
    1993         372 :     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    1994          93 :     return true;
    1995         109 :   case AMDGPU::FABS_R600:
    1996         109 :     if (!Abs.getNode())
    1997             :       return false;
    1998         186 :     Src = Src.getOperand(0);
    1999         372 :     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    2000          93 :     return true;
    2001       10266 :   case AMDGPU::CONST_COPY: {
    2002       20532 :     unsigned Opcode = ParentNode->getMachineOpcode();
    2003       10266 :     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2004             : 
    2005       10266 :     if (!Sel.getNode())
    2006             :       return false;
    2007             : 
    2008       17842 :     SDValue CstOffset = Src.getOperand(0);
    2009       26763 :     if (ParentNode->getValueType(0).isVector())
    2010             :       return false;
    2011             : 
    2012             :     // Gather constants values
    2013             :     int SrcIndices[] = {
    2014        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
    2015        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
    2016        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
    2017        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
    2018        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
    2019        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
    2020        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
    2021        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
    2022        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
    2023        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
    2024        8921 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    2025       98131 :     };
    2026        8921 :     std::vector<unsigned> Consts;
    2027      107052 :     for (int OtherSrcIdx : SrcIndices) {
    2028       98131 :       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
    2029       98131 :       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
    2030       80054 :         continue;
    2031       18077 :       if (HasDst) {
    2032       18077 :         OtherSrcIdx--;
    2033       18077 :         OtherSelIdx--;
    2034             :       }
    2035             :       if (RegisterSDNode *Reg =
    2036       36814 :           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
    2037         660 :         if (Reg->getReg() == AMDGPU::ALU_CONST) {
    2038             :           ConstantSDNode *Cst
    2039        1524 :             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
    2040        1016 :           Consts.push_back(Cst->getZExtValue());
    2041             :         }
    2042             :       }
    2043             :     }
    2044             : 
    2045        8921 :     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    2046       17842 :     Consts.push_back(Cst->getZExtValue());
    2047        8921 :     if (!TII->fitsConstReadLimitations(Consts)) {
    2048             :       return false;
    2049             :     }
    2050             : 
    2051        8913 :     Sel = CstOffset;
    2052        8913 :     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    2053        8913 :     return true;
    2054             :   }
    2055          14 :   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
    2056             :     // Check if the Imm slot is used. Taken from below.
    2057          28 :     if (cast<ConstantSDNode>(Imm)->getZExtValue())
    2058             :       return false;
    2059          28 :     Imm = Src.getOperand(0);
    2060          14 :     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
    2061          14 :     return true;
    2062       26742 :   case AMDGPU::MOV_IMM_I32:
    2063             :   case AMDGPU::MOV_IMM_F32: {
    2064       26742 :     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    2065       26742 :     uint64_t ImmValue = 0;
    2066             : 
    2067       53484 :     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
    2068        1398 :       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
    2069         466 :       float FloatValue = FPC->getValueAPF().convertToFloat();
    2070         466 :       if (FloatValue == 0.0) {
    2071             :         ImmReg = AMDGPU::ZERO;
    2072         324 :       } else if (FloatValue == 0.5) {
    2073             :         ImmReg = AMDGPU::HALF;
    2074         289 :       } else if (FloatValue == 1.0) {
    2075             :         ImmReg = AMDGPU::ONE;
    2076             :       } else {
    2077         663 :         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
    2078             :       }
    2079             :     } else {
    2080       78828 :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
    2081       26276 :       uint64_t Value = C->getZExtValue();
    2082       26276 :       if (Value == 0) {
    2083             :         ImmReg = AMDGPU::ZERO;
    2084       24632 :       } else if (Value == 1) {
    2085             :         ImmReg = AMDGPU::ONE_INT;
    2086             :       } else {
    2087             :         ImmValue = Value;
    2088             :       }
    2089             :     }
    2090             : 
    2091             :     // Check that we aren't already using an immediate.
    2092             :     // XXX: It's possible for an instruction to have more than one
    2093             :     // immediate operand, but this is not supported yet.
    2094             :     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
    2095       21938 :       if (!Imm.getNode())
    2096             :         return false;
    2097       21884 :       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
    2098             :       assert(C);
    2099       21884 :       if (C->getZExtValue())
    2100             :         return false;
    2101       78560 :       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    2102             :     }
    2103       24444 :     Src = DAG.getRegister(ImmReg, MVT::i32);
    2104       24444 :     return true;
    2105             :   }
    2106             :   default:
    2107             :     return false;
    2108             :   }
    2109             : }
    2110             : 
    2111             : /// \brief Fold the instructions after selecting them
    2112      157242 : SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
    2113             :                                             SelectionDAG &DAG) const {
    2114      314484 :   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    2115      157242 :   if (!Node->isMachineOpcode())
    2116             :     return Node;
    2117             : 
    2118      314484 :   unsigned Opcode = Node->getMachineOpcode();
    2119      157242 :   SDValue FakeOp;
    2120             : 
    2121      471726 :   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
    2122             : 
    2123      157242 :   if (Opcode == AMDGPU::DOT_4) {
    2124             :     int OperandIdx[] = {
    2125         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
    2126         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
    2127         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
    2128         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
    2129         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
    2130         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
    2131         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
    2132         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    2133        1144 :         };
    2134             :     int NegIdx[] = {
    2135         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
    2136         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
    2137         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
    2138         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
    2139         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
    2140         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
    2141         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
    2142         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    2143        1144 :     };
    2144             :     int AbsIdx[] = {
    2145         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
    2146         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
    2147         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
    2148         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
    2149         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
    2150         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
    2151         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
    2152         143 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    2153        1144 :     };
    2154        1025 :     for (unsigned i = 0; i < 8; i++) {
    2155         962 :       if (OperandIdx[i] < 0)
    2156          80 :         return Node;
    2157        1924 :       SDValue &Src = Ops[OperandIdx[i] - 1];
    2158        1924 :       SDValue &Neg = Ops[NegIdx[i] - 1];
    2159        1924 :       SDValue &Abs = Ops[AbsIdx[i] - 1];
    2160         962 :       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2161         962 :       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
    2162         962 :       if (HasDst)
    2163         962 :         SelIdx--;
    2164        1924 :       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
    2165         962 :       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
    2166         320 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2167             :     }
    2168      157099 :   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    2169       30307 :     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
    2170       40040 :       SDValue &Src = Ops[i];
    2171       20020 :       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
    2172        5100 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2173             :     }
    2174      151318 :   } else if (Opcode == AMDGPU::CLAMP_R600) {
    2175           0 :     SDValue Src = Node->getOperand(0);
    2176           0 :     if (!Src.isMachineOpcode() ||
    2177           0 :         !TII->hasInstrModifiers(Src.getMachineOpcode()))
    2178             :       return Node;
    2179           0 :     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
    2180           0 :         AMDGPU::OpName::clamp);
    2181           0 :     if (ClampIdx < 0)
    2182             :       return Node;
    2183           0 :     SDLoc DL(Node);
    2184           0 :     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
    2185           0 :     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
    2186           0 :     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
    2187           0 :                               Node->getVTList(), Ops);
    2188             :   } else {
    2189      151318 :     if (!TII->hasInstrModifiers(Opcode))
    2190      137135 :       return Node;
    2191             :     int OperandIdx[] = {
    2192      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
    2193      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
    2194      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    2195      310881 :     };
    2196             :     int NegIdx[] = {
    2197      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
    2198      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
    2199      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    2200      310881 :     };
    2201      103627 :     int AbsIdx[] = {
    2202      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
    2203      103627 :       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
    2204             :       -1
    2205      207254 :     };
    2206      286085 :     for (unsigned i = 0; i < 3; i++) {
    2207      271902 :       if (OperandIdx[i] < 0)
    2208       89444 :         return Node;
    2209      429320 :       SDValue &Src = Ops[OperandIdx[i] - 1];
    2210      429320 :       SDValue &Neg = Ops[NegIdx[i] - 1];
    2211      214660 :       SDValue FakeAbs;
    2212      372430 :       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
    2213      214660 :       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
    2214      214660 :       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
    2215      214660 :       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
    2216      214660 :       if (HasDst) {
    2217      214660 :         SelIdx--;
    2218      214660 :         ImmIdx--;
    2219             :       }
    2220      429320 :       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
    2221      429320 :       SDValue &Imm = Ops[ImmIdx];
    2222      214660 :       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
    2223      128808 :         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    2224             :     }
    2225             :   }
    2226             : 
    2227             :   return Node;
    2228             : }

Generated by: LCOV version 1.13