LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInstrInfo.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-07-13 00:08:38
                  Hit    Total   Coverage
Lines:           1800     2187     82.3 %
Functions:        109      122     89.3 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// SI Implementation of TargetInstrInfo.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIInstrInfo.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUIntrinsicInfo.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "GCNHazardRecognizer.h"
      20             : #include "SIDefines.h"
      21             : #include "SIMachineFunctionInfo.h"
      22             : #include "SIRegisterInfo.h"
      23             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      24             : #include "Utils/AMDGPUBaseInfo.h"
      25             : #include "llvm/ADT/APInt.h"
      26             : #include "llvm/ADT/ArrayRef.h"
      27             : #include "llvm/ADT/SmallVector.h"
      28             : #include "llvm/ADT/StringRef.h"
      29             : #include "llvm/ADT/iterator_range.h"
      30             : #include "llvm/Analysis/AliasAnalysis.h"
      31             : #include "llvm/Analysis/MemoryLocation.h"
      32             : #include "llvm/Analysis/ValueTracking.h"
      33             : #include "llvm/CodeGen/MachineBasicBlock.h"
      34             : #include "llvm/CodeGen/MachineFrameInfo.h"
      35             : #include "llvm/CodeGen/MachineFunction.h"
      36             : #include "llvm/CodeGen/MachineInstr.h"
      37             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      38             : #include "llvm/CodeGen/MachineInstrBundle.h"
      39             : #include "llvm/CodeGen/MachineMemOperand.h"
      40             : #include "llvm/CodeGen/MachineOperand.h"
      41             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      42             : #include "llvm/CodeGen/RegisterScavenging.h"
      43             : #include "llvm/CodeGen/ScheduleDAG.h"
      44             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      45             : #include "llvm/CodeGen/TargetOpcodes.h"
      46             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      47             : #include "llvm/IR/DebugLoc.h"
      48             : #include "llvm/IR/DiagnosticInfo.h"
      49             : #include "llvm/IR/Function.h"
      50             : #include "llvm/IR/InlineAsm.h"
      51             : #include "llvm/IR/LLVMContext.h"
      52             : #include "llvm/MC/MCInstrDesc.h"
      53             : #include "llvm/Support/Casting.h"
      54             : #include "llvm/Support/CommandLine.h"
      55             : #include "llvm/Support/Compiler.h"
      56             : #include "llvm/Support/ErrorHandling.h"
      57             : #include "llvm/Support/MachineValueType.h"
      58             : #include "llvm/Support/MathExtras.h"
      59             : #include "llvm/Target/TargetMachine.h"
      60             : #include <cassert>
      61             : #include <cstdint>
      62             : #include <iterator>
      63             : #include <utility>
      64             : 
      65             : using namespace llvm;
      66             : 
      67             : #define GET_INSTRINFO_CTOR_DTOR
      68             : #include "AMDGPUGenInstrInfo.inc"
      69             : 
      70             : namespace llvm {
      71             : namespace AMDGPU {
      72             : #define GET_D16ImageDimIntrinsics_IMPL
      73             : #define GET_ImageDimIntrinsicTable_IMPL
      74             : #define GET_RsrcIntrinsics_IMPL
      75             : #include "AMDGPUGenSearchableTables.inc"
      76             : }
      77             : }
      78             : 
      79             : 
       80             : // Must be at least 4 to be able to branch over the minimum unconditional
       81             : // branch code. This is only for making it possible to write reasonably small
       82             : // tests for long branches.
      83             : static cl::opt<unsigned>
      84      299229 : BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
      85      199486 :                  cl::desc("Restrict range of branch instructions (DEBUG)"));
      86             : 
      87        2271 : SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
      88             :   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
      89        2271 :     RI(ST), ST(ST) {}
      90             : 
      91             : //===----------------------------------------------------------------------===//
      92             : // TargetInstrInfo callbacks
      93             : //===----------------------------------------------------------------------===//
      94             : 
      95             : static unsigned getNumOperandsNoGlue(SDNode *Node) {
      96      425126 :   unsigned N = Node->getNumOperands();
      97      920882 :   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
      98             :     --N;
      99             :   return N;
     100             : }
     101             : 
     102      418472 : static SDValue findChainOperand(SDNode *Load) {
     103      836944 :   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
     104             :   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
     105      418472 :   return LastOp;
     106             : }
     107             : 
     108             : /// Returns true if both nodes have the same value for the given
      109             : ///        operand \p OpName, or if both nodes do not have this operand.
     110      593301 : static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
     111      593301 :   unsigned Opc0 = N0->getMachineOpcode();
     112      593301 :   unsigned Opc1 = N1->getMachineOpcode();
     113             : 
     114      593301 :   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
     115      593301 :   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
     116             : 
     117      593301 :   if (Op0Idx == -1 && Op1Idx == -1)
     118             :     return true;
     119             : 
     120             : 
     121      588673 :   if ((Op0Idx == -1 && Op1Idx != -1) ||
     122             :       (Op1Idx == -1 && Op0Idx != -1))
     123             :     return false;
     124             : 
     125             :   // getNamedOperandIdx returns the index for the MachineInstr's operands,
     126             :   // which includes the result as the first operand. We are indexing into the
     127             :   // MachineSDNode's operands, so we need to skip the result operand to get
     128             :   // the real index.
     129      588647 :   --Op0Idx;
     130      588647 :   --Op1Idx;
     131             : 
     132     1177294 :   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
     133             : }
     134             : 
     135       20487 : bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
     136             :                                                     AliasAnalysis *AA) const {
     137             :   // TODO: The generic check fails for VALU instructions that should be
     138             :   // rematerializable due to implicit reads of exec. We really want all of the
     139             :   // generic logic for this except for this.
     140       20487 :   switch (MI.getOpcode()) {
     141             :   case AMDGPU::V_MOV_B32_e32:
     142             :   case AMDGPU::V_MOV_B32_e64:
     143             :   case AMDGPU::V_MOV_B64_PSEUDO:
     144             :     return true;
     145       13770 :   default:
     146       13770 :     return false;
     147             :   }
     148             : }
     149             : 
     150      387788 : bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     151             :                                           int64_t &Offset0,
     152             :                                           int64_t &Offset1) const {
     153      387788 :   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
     154             :     return false;
     155             : 
     156             :   unsigned Opc0 = Load0->getMachineOpcode();
     157             :   unsigned Opc1 = Load1->getMachineOpcode();
     158             : 
     159             :   // Make sure both are actually loads.
     160     1690805 :   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     161             :     return false;
     162             : 
     163      613222 :   if (isDS(Opc0) && isDS(Opc1)) {
     164             : 
     165             :     // FIXME: Handle this case:
     166        3327 :     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
     167             :       return false;
     168             : 
     169             :     // Check base reg.
     170        3311 :     if (Load0->getOperand(1) != Load1->getOperand(1))
     171             :       return false;
     172             : 
     173             :     // Check chain.
     174          16 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     175             :       return false;
     176             : 
     177             :     // Skip read2 / write2 variants for simplicity.
     178             :     // TODO: We should report true if the used offsets are adjacent (excluded
     179             :     // st64 versions).
     180          32 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
     181          16 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
     182             :       return false;
     183             : 
     184          48 :     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
     185          48 :     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
     186          16 :     return true;
     187             :   }
     188             : 
     189      370078 :   if (isSMRD(Opc0) && isSMRD(Opc1)) {
     190             :     // Skip time and cache invalidation instructions.
     191       41295 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
     192       20644 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
     193             :       return false;
     194             : 
     195             :     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
     196             : 
     197             :     // Check base reg.
     198       20637 :     if (Load0->getOperand(0) != Load1->getOperand(0))
     199             :       return false;
     200             : 
     201             :     const ConstantSDNode *Load0Offset =
     202             :         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
     203             :     const ConstantSDNode *Load1Offset =
     204             :         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
     205             : 
     206       17095 :     if (!Load0Offset || !Load1Offset)
     207             :       return false;
     208             : 
     209             :     // Check chain.
     210       17083 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     211             :       return false;
     212             : 
     213       34166 :     Offset0 = Load0Offset->getZExtValue();
     214       34166 :     Offset1 = Load1Offset->getZExtValue();
     215       17083 :     return true;
     216             :   }
     217             : 
     218             :   // MUBUF and MTBUF can access the same addresses.
     219      567210 :   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
     220             : 
     221             :     // MUBUF and MTBUF have vaddr at different indices.
     222      214747 :     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
     223      599021 :         findChainOperand(Load0) != findChainOperand(Load1) ||
     224      593301 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
     225      186417 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
     226             :       return false;
     227             : 
     228      185029 :     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
     229      185029 :     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
     230             : 
     231      185029 :     if (OffIdx0 == -1 || OffIdx1 == -1)
     232             :       return false;
     233             : 
      234             :     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
      235             :     // include the output in the operand list, but SDNodes don't, we need to
      236             :     // subtract one from the index.
     237      185029 :     --OffIdx0;
     238      185029 :     --OffIdx1;
     239             : 
     240      370058 :     SDValue Off0 = Load0->getOperand(OffIdx0);
     241      370058 :     SDValue Off1 = Load1->getOperand(OffIdx1);
     242             : 
     243             :     // The offset might be a FrameIndexSDNode.
     244             :     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
     245             :       return false;
     246             : 
     247      370058 :     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
     248      370058 :     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
     249      185029 :     return true;
     250             :   }
     251             : 
     252             :   return false;
     253             : }
     254             : 
     255             : static bool isStride64(unsigned Opc) {
     256             :   switch (Opc) {
     257             :   case AMDGPU::DS_READ2ST64_B32:
     258             :   case AMDGPU::DS_READ2ST64_B64:
     259             :   case AMDGPU::DS_WRITE2ST64_B32:
     260             :   case AMDGPU::DS_WRITE2ST64_B64:
     261             :     return true;
     262             :   default:
     263             :     return false;
     264             :   }
     265             : }
     266             : 
     267     1041419 : bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     268             :                                         int64_t &Offset,
     269             :                                         const TargetRegisterInfo *TRI) const {
     270     1041419 :   unsigned Opc = LdSt.getOpcode();
     271             : 
     272     1041419 :   if (isDS(LdSt)) {
     273      127139 :     const MachineOperand *OffsetImm =
     274             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     275      127139 :     if (OffsetImm) {
     276             :       // Normal, single offset LDS instruction.
     277       63314 :       const MachineOperand *AddrReg =
     278             :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     279             : 
     280       63314 :       BaseReg = AddrReg->getReg();
     281       63314 :       Offset = OffsetImm->getImm();
     282       63314 :       return true;
     283             :     }
     284             : 
     285             :     // The 2 offset instructions use offset0 and offset1 instead. We can treat
     286             :     // these as a load with a single offset if the 2 offsets are consecutive. We
     287             :     // will use this for some partially aligned loads.
     288       63825 :     const MachineOperand *Offset0Imm =
     289             :         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
     290       63825 :     const MachineOperand *Offset1Imm =
     291             :         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
     292             : 
     293       63825 :     uint8_t Offset0 = Offset0Imm->getImm();
     294       63825 :     uint8_t Offset1 = Offset1Imm->getImm();
     295             : 
     296       63825 :     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
     297             :       // Each of these offsets is in element sized units, so we need to convert
     298             :       // to bytes of the individual reads.
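                      :       // For example, DS_READ2_B32 writes a 64-bit destination holding two
                      :       // 32-bit elements, so dividing the register size by 16 below yields the
                      :       // 4-byte element size; stores take it from data0, hence the divide by 8.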
     299             : 
     300             :       unsigned EltSize;
     301       49773 :       if (LdSt.mayLoad())
     302       25140 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
     303             :       else {
     304             :         assert(LdSt.mayStore());
     305       37203 :         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
     306       74406 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
     307             :       }
     308             : 
     309             :       if (isStride64(Opc))
     310           5 :         EltSize *= 64;
     311             : 
     312       49773 :       const MachineOperand *AddrReg =
     313             :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     314       49773 :       BaseReg = AddrReg->getReg();
     315       49773 :       Offset = EltSize * Offset0;
     316       49773 :       return true;
     317             :     }
     318             : 
     319             :     return false;
     320             :   }
     321             : 
     322     1012674 :   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
     323      816018 :     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
     324     1632032 :     if (SOffset && SOffset->isReg())
     325             :       return false;
     326             : 
     327       93418 :     const MachineOperand *AddrReg =
     328             :         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     329       93418 :     if (!AddrReg)
     330             :       return false;
     331             : 
     332        5821 :     const MachineOperand *OffsetImm =
     333             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     334        5821 :     BaseReg = AddrReg->getReg();
     335        5821 :     Offset = OffsetImm->getImm();
     336             : 
     337        5821 :     if (SOffset) // soffset can be an inline immediate.
     338        5821 :       Offset += SOffset->getImm();
     339             : 
     340             :     return true;
     341             :   }
     342             : 
     343       98262 :   if (isSMRD(LdSt)) {
     344       24399 :     const MachineOperand *OffsetImm =
     345             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     346       24399 :     if (!OffsetImm)
     347             :       return false;
     348             : 
     349       24343 :     const MachineOperand *SBaseReg =
     350             :         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
     351       24343 :     BaseReg = SBaseReg->getReg();
     352       24343 :     Offset = OffsetImm->getImm();
     353       24343 :     return true;
     354             :   }
     355             : 
     356       73863 :   if (isFLAT(LdSt)) {
     357       72041 :     const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     358       72041 :     if (VAddr) {
     359             :       // Can't analyze 2 offsets.
     360       72041 :       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
     361             :         return false;
     362             : 
     363       72041 :       BaseReg = VAddr->getReg();
     364             :     } else {
     365             :       // scratch instructions have either vaddr or saddr.
     366           0 :       BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
     367             :     }
     368             : 
     369       72041 :     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
     370       72041 :     return true;
     371             :   }
     372             : 
     373             :   return false;
     374             : }
     375             : 
     376       21228 : static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
     377             :                                   const MachineInstr &MI2, unsigned BaseReg2) {
     378       21228 :   if (BaseReg1 == BaseReg2)
     379             :     return true;
     380             : 
     381        9202 :   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
     382             :     return false;
     383             : 
     384        8951 :   auto MO1 = *MI1.memoperands_begin();
     385        8951 :   auto MO2 = *MI2.memoperands_begin();
     386        8951 :   if (MO1->getAddrSpace() != MO2->getAddrSpace())
     387             :     return false;
     388             : 
     389             :   auto Base1 = MO1->getValue();
     390             :   auto Base2 = MO2->getValue();
     391        2869 :   if (!Base1 || !Base2)
     392             :     return false;
     393        2839 :   const MachineFunction &MF = *MI1.getParent()->getParent();
     394        2839 :   const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
     395             :   Base1 = GetUnderlyingObject(Base1, DL);
      396             :   Base2 = GetUnderlyingObject(Base2, DL);
     397             : 
     398        5665 :   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
     399             :     return false;
     400             : 
     401        2826 :   return Base1 == Base2;
     402             : }
     403             : 
     404       21228 : bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
     405             :                                       unsigned BaseReg1,
     406             :                                       MachineInstr &SecondLdSt,
     407             :                                       unsigned BaseReg2,
     408             :                                       unsigned NumLoads) const {
     409       21228 :   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
     410             :     return false;
     411             : 
     412             :   const MachineOperand *FirstDst = nullptr;
     413             :   const MachineOperand *SecondDst = nullptr;
     414             : 
     415       14852 :   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
     416       29504 :       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
     417        1885 :       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
     418             :     const unsigned MaxGlobalLoadCluster = 6;
     419        2084 :     if (NumLoads > MaxGlobalLoadCluster)
     420             :       return false;
     421             : 
     422        2084 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     423        2084 :     if (!FirstDst)
     424         756 :       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     425        2084 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
     426        2084 :     if (!SecondDst)
     427         756 :       SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     428       22394 :   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
     429        9561 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
     430        9561 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
     431             :   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
     432        3141 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     433        3141 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     434             :   }
     435             : 
     436       14852 :   if (!FirstDst || !SecondDst)
     437             :     return false;
     438             : 
     439             :   // Try to limit clustering based on the total number of bytes loaded
     440             :   // rather than the number of instructions.  This is done to help reduce
     441             :   // register pressure.  The method used is somewhat inexact, though,
     442             :   // because it assumes that all loads in the cluster will load the
     443             :   // same number of bytes as FirstLdSt.
     444             : 
     445             :   // The unit of this value is bytes.
     446             :   // FIXME: This needs finer tuning.
     447             :   unsigned LoadClusterThreshold = 16;
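                      :   // For example, with the 16 byte threshold this allows clustering up to
                      :   // four 32-bit loads or two 64-bit loads that share a base pointer.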
     448             : 
     449             :   const MachineRegisterInfo &MRI =
     450       13404 :       FirstLdSt.getParent()->getParent()->getRegInfo();
     451       13404 :   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
     452             : 
     453       13404 :   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
     454             : }
     455             : 
      456             : // FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
      457             : // the first 16 loads will be interleaved with the stores, and the next 16 will
      458             : // be clustered as expected. It should really split into two batches of 16.
     459             : //
     460             : // Loads are clustered until this returns false, rather than trying to schedule
     461             : // groups of stores. This also means we have to deal with saying different
     462             : // address space loads should be clustered, and ones which might cause bank
     463             : // conflicts.
     464             : //
     465             : // This might be deprecated so it might not be worth that much effort to fix.
     466       30133 : bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
     467             :                                           int64_t Offset0, int64_t Offset1,
     468             :                                           unsigned NumLoads) const {
     469             :   assert(Offset1 > Offset0 &&
     470             :          "Second offset should be larger than first offset!");
      471             :   // If we have 16 or fewer loads in a row, and the offsets are within 64
     472             :   // bytes, then schedule together.
     473             : 
     474             :   // A cacheline is 64 bytes (for global memory).
     475       30133 :   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
     476             : }
     477             : 
     478          10 : static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
     479             :                               MachineBasicBlock::iterator MI,
     480             :                               const DebugLoc &DL, unsigned DestReg,
     481             :                               unsigned SrcReg, bool KillSrc) {
     482          10 :   MachineFunction *MF = MBB.getParent();
     483             :   DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
     484             :                                         "illegal SGPR to VGPR copy",
     485          20 :                                         DL, DS_Error);
     486          10 :   LLVMContext &C = MF->getFunction().getContext();
     487          10 :   C.diagnose(IllegalCopy);
     488             : 
     489          30 :   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
     490          10 :     .addReg(SrcReg, getKillRegState(KillSrc));
     491          10 : }
     492             : 
     493       54908 : void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     494             :                               MachineBasicBlock::iterator MI,
     495             :                               const DebugLoc &DL, unsigned DestReg,
     496             :                               unsigned SrcReg, bool KillSrc) const {
     497       54908 :   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
     498             : 
     499       54908 :   if (RC == &AMDGPU::VGPR_32RegClass) {
     500             :     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
     501             :            AMDGPU::SReg_32RegClass.contains(SrcReg));
     502       86628 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     503       28876 :       .addReg(SrcReg, getKillRegState(KillSrc));
     504       28876 :     return;
     505             :   }
     506             : 
     507       26032 :   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
     508             :       RC == &AMDGPU::SReg_32RegClass) {
     509       18985 :     if (SrcReg == AMDGPU::SCC) {
     510           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
     511             :           .addImm(-1)
     512             :           .addImm(0);
     513           0 :       return;
     514             :     }
     515             : 
     516       37970 :     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
     517           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     518           2 :       return;
     519             :     }
     520             : 
     521       56949 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     522       18983 :             .addReg(SrcReg, getKillRegState(KillSrc));
     523       18983 :     return;
     524             :   }
     525             : 
     526        7047 :   if (RC == &AMDGPU::SReg_64RegClass) {
     527        2520 :     if (DestReg == AMDGPU::VCC) {
     528          50 :       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     529          72 :         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
     530          24 :           .addReg(SrcReg, getKillRegState(KillSrc));
     531             :       } else {
     532             :         // FIXME: Hack until VReg_1 removed.
     533             :         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
     534           3 :         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
     535             :           .addImm(0)
     536           1 :           .addReg(SrcReg, getKillRegState(KillSrc));
     537             :       }
     538             : 
     539             :       return;
     540             :     }
     541             : 
     542        4988 :     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     543           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     544           2 :       return;
     545             :     }
     546             : 
     547        7479 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     548        2493 :             .addReg(SrcReg, getKillRegState(KillSrc));
     549        2493 :     return;
     550             :   }
     551             : 
     552        4527 :   if (DestReg == AMDGPU::SCC) {
     553             :     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
     554           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
     555           0 :       .addReg(SrcReg, getKillRegState(KillSrc))
     556             :       .addImm(0);
     557           0 :     return;
     558             :   }
     559             : 
     560             :   unsigned EltSize = 4;
     561             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     562        4527 :   if (RI.isSGPRClass(RC)) {
     563         143 :     if (RI.getRegSizeInBits(*RC) > 32) {
     564             :       Opcode =  AMDGPU::S_MOV_B64;
     565             :       EltSize = 8;
     566             :     } else {
     567             :       Opcode = AMDGPU::S_MOV_B32;
     568             :       EltSize = 4;
     569             :     }
     570             : 
     571         286 :     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
     572           6 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     573           6 :       return;
     574             :     }
     575             :   }
     576             : 
     577        4521 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
     578             :   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
     579             : 
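                      :   // Copy one EltSize-sized piece at a time, e.g. a 128-bit SGPR copy expands
                      :   // to two S_MOV_B64s and a 128-bit VGPR copy to four V_MOV_B32s.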
     580       23345 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     581             :     unsigned SubIdx;
     582        9412 :     if (Forward)
     583        5734 :       SubIdx = SubIndices[Idx];
     584             :     else
     585        7356 :       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
     586             : 
     587             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     588       18824 :       get(Opcode), RI.getSubReg(DestReg, SubIdx));
     589             : 
     590        9412 :     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
     591             : 
     592        9412 :     if (Idx == 0)
     593        4521 :       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
     594             : 
     595        9412 :     bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
     596        9412 :     Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
     597             :   }
     598             : }
     599             : 
     600      277233 : int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
     601             :   int NewOpc;
     602             : 
     603             :   // Try to map original to commuted opcode
     604      277233 :   NewOpc = AMDGPU::getCommuteRev(Opcode);
     605      277233 :   if (NewOpc != -1)
     606             :     // Check if the commuted (REV) opcode exists on the target.
     607       16392 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     608             : 
     609             :   // Try to map commuted to original opcode
     610      260841 :   NewOpc = AMDGPU::getCommuteOrig(Opcode);
     611      260841 :   if (NewOpc != -1)
     612             :     // Check if the original (non-REV) opcode exists on the target.
     613       40126 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     614             : 
     615      220715 :   return Opcode;
     616             : }
     617             : 
     618           0 : void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
     619             :                                        MachineBasicBlock::iterator MI,
     620             :                                        const DebugLoc &DL, unsigned DestReg,
     621             :                                        int64_t Value) const {
     622           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     623             :   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
     624           0 :   if (RegClass == &AMDGPU::SReg_32RegClass ||
     625           0 :       RegClass == &AMDGPU::SGPR_32RegClass ||
     626           0 :       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
     627             :       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
     628           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     629             :       .addImm(Value);
     630           0 :     return;
     631             :   }
     632             : 
     633           0 :   if (RegClass == &AMDGPU::SReg_64RegClass ||
     634           0 :       RegClass == &AMDGPU::SGPR_64RegClass ||
     635             :       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
     636           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     637             :       .addImm(Value);
     638           0 :     return;
     639             :   }
     640             : 
     641           0 :   if (RegClass == &AMDGPU::VGPR_32RegClass) {
     642           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     643             :       .addImm(Value);
     644           0 :     return;
     645             :   }
     646           0 :   if (RegClass == &AMDGPU::VReg_64RegClass) {
     647           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
     648             :       .addImm(Value);
     649           0 :     return;
     650             :   }
     651             : 
     652             :   unsigned EltSize = 4;
     653             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     654           0 :   if (RI.isSGPRClass(RegClass)) {
     655           0 :     if (RI.getRegSizeInBits(*RegClass) > 32) {
     656             :       Opcode =  AMDGPU::S_MOV_B64;
     657             :       EltSize = 8;
     658             :     } else {
     659             :       Opcode = AMDGPU::S_MOV_B32;
     660             :       EltSize = 4;
     661             :     }
     662             :   }
     663             : 
     664           0 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
     665           0 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     666           0 :     int64_t IdxValue = Idx == 0 ? Value : 0;
     667             : 
     668             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     669           0 :       get(Opcode), RI.getSubReg(DestReg, Idx));
     670             :     Builder.addImm(IdxValue);
     671             :   }
     672             : }
     673             : 
     674             : const TargetRegisterClass *
     675           0 : SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
     676           0 :   return &AMDGPU::VGPR_32RegClass;
     677             : }
     678             : 
     679           0 : void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     680             :                                      MachineBasicBlock::iterator I,
     681             :                                      const DebugLoc &DL, unsigned DstReg,
     682             :                                      ArrayRef<MachineOperand> Cond,
     683             :                                      unsigned TrueReg,
     684             :                                      unsigned FalseReg) const {
     685           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     686             :   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
     687             :          "Not a VGPR32 reg");
     688             : 
     689           0 :   if (Cond.size() == 1) {
     690           0 :     unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     691           0 :     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     692             :       .add(Cond[0]);
     693           0 :     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     694           0 :       .addReg(FalseReg)
     695           0 :       .addReg(TrueReg)
     696           0 :       .addReg(SReg);
     697           0 :   } else if (Cond.size() == 2) {
     698             :     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
     699           0 :     switch (Cond[0].getImm()) {
     700             :     case SIInstrInfo::SCC_TRUE: {
     701           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     702           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     703             :         .addImm(-1)
     704             :         .addImm(0);
     705           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     706           0 :         .addReg(FalseReg)
     707           0 :         .addReg(TrueReg)
     708           0 :         .addReg(SReg);
     709           0 :       break;
     710             :     }
     711             :     case SIInstrInfo::SCC_FALSE: {
     712           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     713           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     714             :         .addImm(0)
     715             :         .addImm(-1);
     716           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     717           0 :         .addReg(FalseReg)
     718           0 :         .addReg(TrueReg)
     719           0 :         .addReg(SReg);
     720           0 :       break;
     721             :     }
     722           0 :     case SIInstrInfo::VCCNZ: {
     723           0 :       MachineOperand RegOp = Cond[1];
     724             :       RegOp.setImplicit(false);
     725           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     726           0 :       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     727             :         .add(RegOp);
     728           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     729           0 :           .addReg(FalseReg)
     730           0 :           .addReg(TrueReg)
     731           0 :           .addReg(SReg);
     732             :       break;
     733             :     }
     734           0 :     case SIInstrInfo::VCCZ: {
     735           0 :       MachineOperand RegOp = Cond[1];
     736             :       RegOp.setImplicit(false);
     737           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     738           0 :       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     739             :         .add(RegOp);
     740           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     741           0 :           .addReg(TrueReg)
     742           0 :           .addReg(FalseReg)
     743           0 :           .addReg(SReg);
     744             :       break;
     745             :     }
     746             :     case SIInstrInfo::EXECNZ: {
     747           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     748           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     749           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     750             :         .addImm(0);
     751           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     752             :         .addImm(-1)
     753             :         .addImm(0);
     754           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     755           0 :         .addReg(FalseReg)
     756           0 :         .addReg(TrueReg)
     757           0 :         .addReg(SReg);
     758           0 :       break;
     759             :     }
     760             :     case SIInstrInfo::EXECZ: {
     761           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     762           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     763           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     764             :         .addImm(0);
     765           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     766             :         .addImm(0)
     767             :         .addImm(-1);
     768           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     769           0 :         .addReg(FalseReg)
     770           0 :         .addReg(TrueReg)
     771           0 :         .addReg(SReg);
     772           0 :       llvm_unreachable("Unhandled branch predicate EXECZ");
     773             :       break;
     774             :     }
     775           0 :     default:
     776           0 :       llvm_unreachable("invalid branch predicate");
     777             :     }
     778             :   } else {
     779           0 :     llvm_unreachable("Can only handle Cond size 1 or 2");
     780             :   }
     781           0 : }
     782             : 
     783           0 : unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
     784             :                                MachineBasicBlock::iterator I,
     785             :                                const DebugLoc &DL,
     786             :                                unsigned SrcReg, int Value) const {
     787           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     788           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     789           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
     790           0 :     .addImm(Value)
     791           0 :     .addReg(SrcReg);
     792             : 
     793           0 :   return Reg;
     794             : }
     795             : 
     796           0 : unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
     797             :                                MachineBasicBlock::iterator I,
     798             :                                const DebugLoc &DL,
     799             :                                unsigned SrcReg, int Value) const {
     800           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     801           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     802           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
     803           0 :     .addImm(Value)
     804           0 :     .addReg(SrcReg);
     805             : 
     806           0 :   return Reg;
     807             : }
     808             : 
     809       10747 : unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
     810             : 
     811       10747 :   if (RI.getRegSizeInBits(*DstRC) == 32) {
     812       20670 :     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     813         824 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
     814             :     return AMDGPU::S_MOV_B64;
     815         812 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
     816             :     return  AMDGPU::V_MOV_B64_PSEUDO;
     817             :   }
     818             :   return AMDGPU::COPY;
     819             : }
     820             : 
     821             : static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     822         674 :   switch (Size) {
     823             :   case 4:
     824             :     return AMDGPU::SI_SPILL_S32_SAVE;
     825          96 :   case 8:
     826             :     return AMDGPU::SI_SPILL_S64_SAVE;
     827          61 :   case 16:
     828             :     return AMDGPU::SI_SPILL_S128_SAVE;
     829          33 :   case 32:
     830             :     return AMDGPU::SI_SPILL_S256_SAVE;
     831           8 :   case 64:
     832             :     return AMDGPU::SI_SPILL_S512_SAVE;
     833           0 :   default:
     834           0 :     llvm_unreachable("unknown register size");
     835             :   }
     836             : }
     837             : 
     838             : static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     839        1191 :   switch (Size) {
     840             :   case 4:
     841             :     return AMDGPU::SI_SPILL_V32_SAVE;
     842           1 :   case 8:
     843             :     return AMDGPU::SI_SPILL_V64_SAVE;
     844           0 :   case 12:
     845             :     return AMDGPU::SI_SPILL_V96_SAVE;
     846         665 :   case 16:
     847             :     return AMDGPU::SI_SPILL_V128_SAVE;
     848           0 :   case 32:
     849             :     return AMDGPU::SI_SPILL_V256_SAVE;
     850           0 :   case 64:
     851             :     return AMDGPU::SI_SPILL_V512_SAVE;
     852           0 :   default:
     853           0 :     llvm_unreachable("unknown register size");
     854             :   }
     855             : }
     856             : 
     857        1865 : void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     858             :                                       MachineBasicBlock::iterator MI,
     859             :                                       unsigned SrcReg, bool isKill,
     860             :                                       int FrameIndex,
     861             :                                       const TargetRegisterClass *RC,
     862             :                                       const TargetRegisterInfo *TRI) const {
     863        1865 :   MachineFunction *MF = MBB.getParent();
     864        1865 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     865        1865 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     866             :   DebugLoc DL = MBB.findDebugLoc(MI);
     867             : 
     868             :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     869             :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     870             :   MachinePointerInfo PtrInfo
     871        1865 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     872             :   MachineMemOperand *MMO
     873        1865 :     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
     874        1865 :                                Size, Align);
     875             :   unsigned SpillSize = TRI->getSpillSize(*RC);
     876             : 
     877        3730 :   if (RI.isSGPRClass(RC)) {
     878             :     MFI->setHasSpilledSGPRs();
     879             : 
     880             :     // We are only allowed to create one new instruction when spilling
     881             :     // registers, so we need to use pseudo instruction for spilling SGPRs.
     882         674 :     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
     883             : 
      884             :     // The SGPR spill/restore instructions only work on numbered SGPRs, so we
      885             :     // need to make sure we are using the correct register class.
     886         674 :     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
     887          25 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     888          25 :       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     889             :     }
     890             : 
     891        1348 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
     892         674 :       .addReg(SrcReg, getKillRegState(isKill)) // data
     893             :       .addFrameIndex(FrameIndex)               // addr
     894             :       .addMemOperand(MMO)
     895         674 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
     896         674 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
     897             :     // Add the scratch resource registers as implicit uses because we may end up
     898             :     // needing them, and need to ensure that the reserved registers are
     899             :     // correctly handled.
     900             : 
     901             :     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
     902         674 :     if (ST.hasScalarStores()) {
     903             :       // m0 is used for offset to scalar stores if used to spill.
     904         341 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
     905             :     }
     906             : 
     907             :     return;
     908             :   }
     909             : 
     910        1191 :   if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
     911           0 :     LLVMContext &Ctx = MF->getFunction().getContext();
     912           0 :     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
     913             :                   " spill register");
     914           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
     915           0 :       .addReg(SrcReg);
     916             : 
     917           0 :     return;
     918             :   }
     919             : 
     920             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
     921             : 
     922             :   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
     923             :   MFI->setHasSpilledVGPRs();
     924        3573 :   BuildMI(MBB, MI, DL, get(Opcode))
     925        1191 :     .addReg(SrcReg, getKillRegState(isKill)) // data
     926             :     .addFrameIndex(FrameIndex)               // addr
     927        1191 :     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     928        1191 :     .addReg(MFI->getFrameOffsetReg())        // scratch_offset
     929             :     .addImm(0)                               // offset
     930             :     .addMemOperand(MMO);
     931             : }
     932             : 
     933             : static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     934         663 :   switch (Size) {
     935             :   case 4:
     936             :     return AMDGPU::SI_SPILL_S32_RESTORE;
     937          93 :   case 8:
     938             :     return AMDGPU::SI_SPILL_S64_RESTORE;
     939          60 :   case 16:
     940             :     return AMDGPU::SI_SPILL_S128_RESTORE;
     941          33 :   case 32:
     942             :     return AMDGPU::SI_SPILL_S256_RESTORE;
     943           8 :   case 64:
     944             :     return AMDGPU::SI_SPILL_S512_RESTORE;
     945           0 :   default:
     946           0 :     llvm_unreachable("unknown register size");
     947             :   }
     948             : }
     949             : 
     950             : static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     951        1105 :   switch (Size) {
     952             :   case 4:
     953             :     return AMDGPU::SI_SPILL_V32_RESTORE;
     954           1 :   case 8:
     955             :     return AMDGPU::SI_SPILL_V64_RESTORE;
     956           0 :   case 12:
     957             :     return AMDGPU::SI_SPILL_V96_RESTORE;
     958         670 :   case 16:
     959             :     return AMDGPU::SI_SPILL_V128_RESTORE;
     960           0 :   case 32:
     961             :     return AMDGPU::SI_SPILL_V256_RESTORE;
     962           0 :   case 64:
     963             :     return AMDGPU::SI_SPILL_V512_RESTORE;
     964           0 :   default:
     965           0 :     llvm_unreachable("unknown register size");
     966             :   }
     967             : }
     968             : 
     969        1768 : void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     970             :                                        MachineBasicBlock::iterator MI,
     971             :                                        unsigned DestReg, int FrameIndex,
     972             :                                        const TargetRegisterClass *RC,
     973             :                                        const TargetRegisterInfo *TRI) const {
     974        1768 :   MachineFunction *MF = MBB.getParent();
     975        1768 :   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     976        1768 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     977             :   DebugLoc DL = MBB.findDebugLoc(MI);
     978             :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     979             :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     980             :   unsigned SpillSize = TRI->getSpillSize(*RC);
     981             : 
     982             :   MachinePointerInfo PtrInfo
     983        1768 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     984             : 
     985        1768 :   MachineMemOperand *MMO = MF->getMachineMemOperand(
     986        1768 :     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
     987             : 
     988        3536 :   if (RI.isSGPRClass(RC)) {
     989             :     // FIXME: Maybe this should not include a memoperand because it will be
     990             :     // lowered to non-memory instructions.
     991         663 :     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
     992         663 :     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
     993          25 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     994          25 :       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     995             :     }
     996             : 
     997             :     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
     998        1326 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
     999             :       .addFrameIndex(FrameIndex) // addr
    1000             :       .addMemOperand(MMO)
    1001         663 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
    1002         663 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
    1003             : 
    1004         663 :     if (ST.hasScalarStores()) {
     1005             :       // m0 is used as the offset for scalar stores when spilling.
    1006         337 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    1007             :     }
    1008             : 
    1009             :     return;
    1010             :   }
    1011             : 
    1012        1105 :   if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
    1013           0 :     LLVMContext &Ctx = MF->getFunction().getContext();
    1014           0 :     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
    1015             :                   " restore register");
    1016           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
    1017             : 
    1018           0 :     return;
    1019             :   }
    1020             : 
    1021             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
    1022             : 
    1023             :   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
    1024        3315 :   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    1025             :     .addFrameIndex(FrameIndex)        // vaddr
    1026        1105 :     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
    1027        1105 :     .addReg(MFI->getFrameOffsetReg()) // scratch_offset
    1028             :     .addImm(0)                        // offset
    1029             :     .addMemOperand(MMO);
    1030             : }
    1031             : 
    1032             : /// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
    1033           0 : unsigned SIInstrInfo::calculateLDSSpillAddress(
    1034             :     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    1035             :     unsigned FrameOffset, unsigned Size) const {
    1036           0 :   MachineFunction *MF = MBB.getParent();
    1037           0 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    1038           0 :   const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
    1039             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1040           0 :   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
    1041           0 :   unsigned WavefrontSize = ST.getWavefrontSize();
    1042             : 
    1043           0 :   unsigned TIDReg = MFI->getTIDReg();
    1044           0 :   if (!MFI->hasCalculatedTID()) {
    1045           0 :     MachineBasicBlock &Entry = MBB.getParent()->front();
    1046             :     MachineBasicBlock::iterator Insert = Entry.front();
    1047             :     DebugLoc DL = Insert->getDebugLoc();
    1048             : 
    1049           0 :     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
    1050             :                                    *MF);
    1051           0 :     if (TIDReg == AMDGPU::NoRegister)
    1052             :       return TIDReg;
    1053             : 
    1054           0 :     if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
    1055             :         WorkGroupSize > WavefrontSize) {
    1056             :       unsigned TIDIGXReg
    1057             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    1058             :       unsigned TIDIGYReg
    1059             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    1060             :       unsigned TIDIGZReg
    1061             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    1062             :       unsigned InputPtrReg =
    1063             :           MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    1064           0 :       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
    1065           0 :         if (!Entry.isLiveIn(Reg))
    1066             :           Entry.addLiveIn(Reg);
    1067             :       }
    1068             : 
    1069           0 :       RS->enterBasicBlock(Entry);
    1070             :       // FIXME: Can we scavenge an SReg_64 and access the subregs?
    1071             :       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    1072             :       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    1073           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
    1074           0 :               .addReg(InputPtrReg)
    1075             :               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
    1076           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
    1077           0 :               .addReg(InputPtrReg)
    1078             :               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
    1079             : 
    1080             :       // NGROUPS.X * NGROUPS.Y
    1081           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
    1082           0 :               .addReg(STmp1)
    1083           0 :               .addReg(STmp0);
    1084             :       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
    1085           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
    1086           0 :               .addReg(STmp1)
    1087           0 :               .addReg(TIDIGXReg);
    1088             :       // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
    1089           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
    1090           0 :               .addReg(STmp0)
    1091           0 :               .addReg(TIDIGYReg)
    1092           0 :               .addReg(TIDReg);
    1093             :       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
    1094           0 :       getAddNoCarry(Entry, Insert, DL, TIDReg)
    1095           0 :         .addReg(TIDReg)
    1096           0 :         .addReg(TIDIGZReg);
    1097             :     } else {
    1098             :       // Get the thread's lane id within the wave.
    1099           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
    1100           0 :               TIDReg)
    1101             :               .addImm(-1)
    1102             :               .addImm(0);
    1103             : 
    1104           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
    1105           0 :               TIDReg)
    1106             :               .addImm(-1)
    1107           0 :               .addReg(TIDReg);
    1108             :     }
    1109             : 
    1110           0 :     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
    1111           0 :             TIDReg)
    1112             :             .addImm(2)
    1113           0 :             .addReg(TIDReg);
    1114             :     MFI->setTIDReg(TIDReg);
    1115             :   }
    1116             : 
    1117             :   // Add FrameIndex to LDS offset
    1118           0 :   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
    1119           0 :   getAddNoCarry(MBB, MI, DL, TmpReg)
    1120           0 :     .addImm(LDSOffset)
    1121           0 :     .addReg(TIDReg);
    1122             : 
    1123           0 :   return TmpReg;
    1124             : }
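                      : // Worked example (hypothetical values): with an LDS size of 1024 bytes, a
                      : // FrameOffset of 16 and a flat work group size of 256, the computation above
                      : // gives LDSOffset = 1024 + 16 * 256 = 5120; TIDReg already holds TID << 2, so
                      : // TmpReg ends up addressing byte 5120 + 4 * TID of LDS for lane TID.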
    1125             : 
    1126         882 : void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
    1127             :                                    MachineBasicBlock::iterator MI,
    1128             :                                    int Count) const {
    1129             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1130        2646 :   while (Count > 0) {
    1131             :     int Arg;
    1132         882 :     if (Count >= 8)
    1133             :       Arg = 7;
    1134             :     else
    1135         882 :       Arg = Count - 1;
    1136         882 :     Count -= 8;
    1137        2646 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
    1138         882 :             .addImm(Arg);
    1139             :   }
    1140         882 : }
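                      : // Worked example for the loop above: S_NOP's immediate encodes (imm + 1) wait
                      : // states (see getNumWaitStates below), so one S_NOP covers at most 8. A Count
                      : // of 10 therefore expands to "s_nop 7" (8 states) followed by "s_nop 1"
                      : // (2 states).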
    1141             : 
    1142         882 : void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
    1143             :                              MachineBasicBlock::iterator MI) const {
    1144         882 :   insertWaitStates(MBB, MI, 1);
    1145         882 : }
    1146             : 
    1147           0 : void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
    1148           0 :   auto MF = MBB.getParent();
    1149           0 :   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1150             : 
    1151             :   assert(Info->isEntryFunction());
    1152             : 
    1153           0 :   if (MBB.succ_empty()) {
    1154           0 :     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    1155           0 :     if (HasNoTerminator)
    1156           0 :       BuildMI(MBB, MBB.end(), DebugLoc(),
    1157           0 :               get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
    1158             :   }
    1159           0 : }
    1160             : 
    1161      489545 : unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
    1162      979090 :   switch (MI.getOpcode()) {
    1163             :   default: return 1; // FIXME: Do wait states equal cycles?
    1164             : 
    1165         499 :   case AMDGPU::S_NOP:
    1166         499 :     return MI.getOperand(0).getImm() + 1;
    1167             :   }
    1168             : }
    1169             : 
    1170      266032 : bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
    1171      266032 :   MachineBasicBlock &MBB = *MI.getParent();
    1172             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1173      532064 :   switch (MI.getOpcode()) {
    1174             :   default: return TargetInstrInfo::expandPostRAPseudo(MI);
    1175           1 :   case AMDGPU::S_MOV_B64_term:
    1176             :     // This is only a terminator to get the correct spill code placement during
    1177             :     // register allocation.
    1178           1 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1179             :     break;
    1180             : 
    1181           0 :   case AMDGPU::S_XOR_B64_term:
    1182             :     // This is only a terminator to get the correct spill code placement during
    1183             :     // register allocation.
    1184           0 :     MI.setDesc(get(AMDGPU::S_XOR_B64));
    1185             :     break;
    1186             : 
    1187           0 :   case AMDGPU::S_ANDN2_B64_term:
    1188             :     // This is only a terminator to get the correct spill code placement during
    1189             :     // register allocation.
    1190           0 :     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    1191             :     break;
    1192             : 
    1193         317 :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1194         317 :     unsigned Dst = MI.getOperand(0).getReg();
    1195         317 :     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    1196         317 :     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    1197             : 
    1198         317 :     const MachineOperand &SrcOp = MI.getOperand(1);
    1199             :     // FIXME: Will this work for 64-bit floating point immediates?
    1200             :     assert(!SrcOp.isFPImm());
    1201         317 :     if (SrcOp.isImm()) {
    1202         317 :       APInt Imm(64, SrcOp.getImm());
    1203         951 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1204         951 :         .addImm(Imm.getLoBits(32).getZExtValue())
    1205         317 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1206         951 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1207         951 :         .addImm(Imm.getHiBits(32).getZExtValue())
    1208         317 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1209             :     } else {
    1210             :       assert(SrcOp.isReg());
    1211           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1212           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
    1213           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1214           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1215           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
    1216           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1217             :     }
    1218         317 :     MI.eraseFromParent();
    1219         317 :     break;
    1220             :   }
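                      :   // Illustrative expansion (hypothetical registers and immediate): for
                      :   //   V_MOV_B64_PSEUDO v[0:1], 0x1234567800000000
                      :   // the case above emits
                      :   //   v_mov_b32_e32 v0, 0           ; low 32 bits of the immediate
                      :   //   v_mov_b32_e32 v1, 0x12345678  ; high 32 bits of the immediate
                      :   // with v[0:1] kept as an implicit def on both halves.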
    1221           8 :   case AMDGPU::V_SET_INACTIVE_B32: {
    1222          24 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1223           8 :       .addReg(AMDGPU::EXEC);
    1224          24 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
    1225           8 :       .add(MI.getOperand(2));
    1226          24 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1227           8 :       .addReg(AMDGPU::EXEC);
    1228           8 :     MI.eraseFromParent();
    1229           8 :     break;
    1230             :   }
    1231           2 :   case AMDGPU::V_SET_INACTIVE_B64: {
    1232           6 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1233           2 :       .addReg(AMDGPU::EXEC);
    1234           4 :     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
    1235           2 :                                  MI.getOperand(0).getReg())
    1236           2 :       .add(MI.getOperand(2));
    1237           2 :     expandPostRAPseudo(*Copy);
    1238           6 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1239           2 :       .addReg(AMDGPU::EXEC);
    1240           2 :     MI.eraseFromParent();
    1241           2 :     break;
    1242             :   }
    1243          66 :   case AMDGPU::V_MOVRELD_B32_V1:
    1244             :   case AMDGPU::V_MOVRELD_B32_V2:
    1245             :   case AMDGPU::V_MOVRELD_B32_V4:
    1246             :   case AMDGPU::V_MOVRELD_B32_V8:
    1247             :   case AMDGPU::V_MOVRELD_B32_V16: {
    1248          66 :     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    1249          66 :     unsigned VecReg = MI.getOperand(0).getReg();
    1250             :     bool IsUndef = MI.getOperand(1).isUndef();
    1251          66 :     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    1252             :     assert(VecReg == MI.getOperand(1).getReg());
    1253             : 
    1254             :     MachineInstr *MovRel =
    1255         132 :         BuildMI(MBB, MI, DL, MovRelDesc)
    1256          66 :             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
    1257          66 :             .add(MI.getOperand(2))
    1258          66 :             .addReg(VecReg, RegState::ImplicitDefine)
    1259             :             .addReg(VecReg,
    1260          66 :                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
    1261             : 
    1262             :     const int ImpDefIdx =
    1263         198 :         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    1264          66 :     const int ImpUseIdx = ImpDefIdx + 1;
    1265          66 :     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
    1266             : 
    1267          66 :     MI.eraseFromParent();
    1268          66 :     break;
    1269             :   }
    1270         521 :   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    1271         521 :     MachineFunction &MF = *MBB.getParent();
    1272         521 :     unsigned Reg = MI.getOperand(0).getReg();
    1273         521 :     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    1274         521 :     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
    1275             : 
    1276             :     // Create a bundle so these instructions won't be re-ordered by the
    1277             :     // post-RA scheduler.
    1278             :     MIBundleBuilder Bundler(MBB, MI);
    1279        1563 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
    1280             : 
    1281             :     // Add 32-bit offset from this instruction to the start of the
    1282             :     // constant data.
    1283        1563 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
    1284         521 :                        .addReg(RegLo)
    1285        1042 :                        .add(MI.getOperand(1)));
    1286             : 
    1287        1563 :     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
    1288         521 :                                   .addReg(RegHi);
    1289        1042 :     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
    1290             :       MIB.addImm(0);
    1291             :     else
    1292             :       MIB.add(MI.getOperand(2));
    1293             : 
    1294             :     Bundler.append(MIB);
    1295         521 :     finalizeBundle(MBB, Bundler.begin());
    1296             : 
    1297         521 :     MI.eraseFromParent();
    1298             :     break;
    1299             :   }
    1300          18 :   case AMDGPU::EXIT_WWM: {
    1301             :     // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
    1302             :     // is exited.
    1303          18 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1304             :     break;
    1305             :   }
    1306          22 :   case TargetOpcode::BUNDLE: {
    1307          22 :     if (!MI.mayLoad())
    1308             :       return false;
    1309             : 
     1310             :     // If it is a load, it must be a memory clause.
    1311          22 :     for (MachineBasicBlock::instr_iterator I = MI.getIterator();
    1312          89 :          I->isBundledWithSucc(); ++I) {
    1313          67 :       I->unbundleFromSucc();
    1314         771 :       for (MachineOperand &MO : I->operands())
    1315         352 :         if (MO.isReg())
    1316             :           MO.setIsInternalRead(false);
    1317             :     }
    1318             : 
    1319          22 :     MI.eraseFromParent();
    1320          22 :     break;
    1321             :   }
    1322             :   }
    1323             :   return true;
    1324             : }
    1325             : 
    1326      220014 : bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
    1327             :                                       MachineOperand &Src0,
    1328             :                                       unsigned Src0OpName,
    1329             :                                       MachineOperand &Src1,
    1330             :                                       unsigned Src1OpName) const {
    1331      220014 :   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
    1332      220014 :   if (!Src0Mods)
    1333             :     return false;
    1334             : 
    1335       47188 :   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
    1336             :   assert(Src1Mods &&
    1337             :          "All commutable instructions have both src0 and src1 modifiers");
    1338             : 
    1339       47188 :   int Src0ModsVal = Src0Mods->getImm();
    1340       47188 :   int Src1ModsVal = Src1Mods->getImm();
    1341             : 
    1342       47188 :   Src1Mods->setImm(Src0ModsVal);
    1343       47188 :   Src0Mods->setImm(Src1ModsVal);
    1344       47188 :   return true;
    1345             : }
    1346             : 
    1347       39372 : static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
    1348             :                                              MachineOperand &RegOp,
    1349             :                                              MachineOperand &NonRegOp) {
    1350       39372 :   unsigned Reg = RegOp.getReg();
    1351             :   unsigned SubReg = RegOp.getSubReg();
    1352             :   bool IsKill = RegOp.isKill();
    1353             :   bool IsDead = RegOp.isDead();
    1354             :   bool IsUndef = RegOp.isUndef();
    1355             :   bool IsDebug = RegOp.isDebug();
    1356             : 
    1357       39372 :   if (NonRegOp.isImm())
    1358       39372 :     RegOp.ChangeToImmediate(NonRegOp.getImm());
    1359           0 :   else if (NonRegOp.isFI())
    1360           0 :     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
    1361             :   else
    1362             :     return nullptr;
    1363             : 
    1364       39372 :   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
    1365             :   NonRegOp.setSubReg(SubReg);
    1366             : 
    1367       39372 :   return &MI;
    1368             : }
    1369             : 
    1370      270140 : MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
    1371             :                                                   unsigned Src0Idx,
    1372             :                                                   unsigned Src1Idx) const {
    1373             :   assert(!NewMI && "this should never be used");
    1374             : 
    1375      270140 :   unsigned Opc = MI.getOpcode();
    1376      270140 :   int CommutedOpcode = commuteOpcode(Opc);
    1377      270140 :   if (CommutedOpcode == -1)
    1378             :     return nullptr;
    1379             : 
    1380             :   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
    1381             :            static_cast<int>(Src0Idx) &&
    1382             :          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
    1383             :            static_cast<int>(Src1Idx) &&
    1384             :          "inconsistency with findCommutedOpIndices");
    1385             : 
    1386      250826 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    1387             :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    1388             : 
    1389             :   MachineInstr *CommutedMI = nullptr;
    1390      470402 :   if (Src0.isReg() && Src1.isReg()) {
    1391      196059 :     if (isOperandLegal(MI, Src1Idx, &Src0)) {
    1392             :       // Be sure to copy the source modifiers to the right place.
    1393             :       CommutedMI
    1394      180642 :         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    1395             :     }
    1396             : 
    1397       78284 :   } else if (Src0.isReg() && !Src1.isReg()) {
    1398             :     // src0 should always be able to support any operand type, so no need to
    1399             :     // check operand legality.
    1400       23517 :     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
    1401       62500 :   } else if (!Src0.isReg() && Src1.isReg()) {
    1402       31229 :     if (isOperandLegal(MI, Src1Idx, &Src0))
    1403       15855 :       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
    1404             :   } else {
    1405             :     // FIXME: Found two non registers to commute. This does happen.
    1406             :     return nullptr;
    1407             :   }
    1408             : 
    1409      220014 :   if (CommutedMI) {
    1410      220014 :     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
    1411             :                         Src1, AMDGPU::OpName::src1_modifiers);
    1412             : 
    1413      220014 :     CommutedMI->setDesc(get(CommutedOpcode));
    1414             :   }
    1415             : 
    1416             :   return CommutedMI;
    1417             : }
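                      : // Illustrative commute (hypothetical operands): an instruction such as
                      : // "v_sub_f32 v0, v1, <literal>" can always be rewritten as
                      : // "v_subrev_f32 v0, <literal>, v1", because src0 accepts any operand kind;
                      : // the opposite direction (literal already in src0) first has to pass the
                      : // isOperandLegal check on src1 above.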
    1418             : 
    1419             : // This needs to be implemented because the source modifiers may be inserted
    1420             : // between the true commutable operands, and the base
    1421             : // TargetInstrInfo::commuteInstruction uses it.
    1422      318098 : bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
    1423             :                                         unsigned &SrcOpIdx1) const {
    1424      318098 :   if (!MI.isCommutable())
    1425             :     return false;
    1426             : 
    1427      273093 :   unsigned Opc = MI.getOpcode();
    1428      273093 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    1429      273093 :   if (Src0Idx == -1)
    1430             :     return false;
    1431             : 
    1432      273093 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    1433      273093 :   if (Src1Idx == -1)
    1434             :     return false;
    1435             : 
    1436      273093 :   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
    1437             : }
    1438             : 
    1439        1025 : bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
    1440             :                                         int64_t BrOffset) const {
    1441             :   // BranchRelaxation should never have to check s_setpc_b64 because its dest
    1442             :   // block is unanalyzable.
    1443             :   assert(BranchOp != AMDGPU::S_SETPC_B64);
    1444             : 
    1445             :   // Convert to dwords.
    1446        1025 :   BrOffset /= 4;
    1447             : 
    1448             :   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
    1449             :   // from the next instruction.
    1450        1025 :   BrOffset -= 1;
    1451             : 
    1452        1025 :   return isIntN(BranchOffsetBits, BrOffset);
    1453             : }
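                      : // Worked example (assuming BranchOffsetBits == 16, i.e. a signed SIMM16):
                      : // a forward branch of 0x20000 bytes gives BrOffset = 0x20000 / 4 - 1 = 32767,
                      : // which still fits; 0x20004 bytes gives 32768 and does not, so
                      : // BranchRelaxation must expand that branch.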
    1454             : 
    1455        1057 : MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
    1456             :   const MachineInstr &MI) const {
    1457        2114 :   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    1458             :     // This would be a difficult analysis to perform, but can always be legal so
    1459             :     // there's no need to analyze it.
    1460             :     return nullptr;
    1461             :   }
    1462             : 
    1463        1057 :   return MI.getOperand(0).getMBB();
    1464             : }
    1465             : 
    1466          32 : unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
    1467             :                                            MachineBasicBlock &DestBB,
    1468             :                                            const DebugLoc &DL,
    1469             :                                            int64_t BrOffset,
    1470             :                                            RegScavenger *RS) const {
    1471             :   assert(RS && "RegScavenger required for long branching");
    1472             :   assert(MBB.empty() &&
    1473             :          "new block should be inserted for expanding unconditional branch");
    1474             :   assert(MBB.pred_size() == 1);
    1475             : 
    1476          32 :   MachineFunction *MF = MBB.getParent();
    1477          32 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    1478             : 
    1479             :   // FIXME: Virtual register workaround for RegScavenger not working with empty
    1480             :   // blocks.
    1481          32 :   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    1482             : 
    1483          32 :   auto I = MBB.end();
    1484             : 
    1485             :   // We need to compute the offset relative to the instruction immediately after
    1486             :   // s_getpc_b64. Insert pc arithmetic code before last terminator.
    1487          64 :   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
    1488             : 
    1489             :   // TODO: Handle > 32-bit block address.
    1490          32 :   if (BrOffset >= 0) {
    1491          72 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
    1492          24 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1493          24 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1494             :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    1495          72 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
    1496          24 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1497          24 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1498             :       .addImm(0);
    1499             :   } else {
    1500             :     // Backwards branch.
    1501          24 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
    1502           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1503           8 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1504             :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    1505          24 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
    1506           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1507           8 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1508             :       .addImm(0);
    1509             :   }
    1510             : 
    1511             :   // Insert the indirect branch after the other terminator.
    1512          96 :   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    1513          32 :     .addReg(PCReg);
    1514             : 
    1515             :   // FIXME: If spilling is necessary, this will fail because this scavenger has
    1516             :   // no emergency stack slots. It is non-trivial to spill in this situation,
    1517             :   // because the restore code needs to be specially placed after the
    1518             :   // jump. BranchRelaxation then needs to be made aware of the newly inserted
    1519             :   // block.
    1520             :   //
    1521             :   // If a spill is needed for the pc register pair, we need to insert a spill
    1522             :   // restore block right before the destination block, and insert a short branch
    1523             :   // into the old destination block's fallthrough predecessor.
    1524             :   // e.g.:
    1525             :   //
    1526             :   // s_cbranch_scc0 skip_long_branch:
    1527             :   //
    1528             :   // long_branch_bb:
    1529             :   //   spill s[8:9]
    1530             :   //   s_getpc_b64 s[8:9]
    1531             :   //   s_add_u32 s8, s8, restore_bb
    1532             :   //   s_addc_u32 s9, s9, 0
    1533             :   //   s_setpc_b64 s[8:9]
    1534             :   //
    1535             :   // skip_long_branch:
    1536             :   //   foo;
    1537             :   //
    1538             :   // .....
    1539             :   //
    1540             :   // dest_bb_fallthrough_predecessor:
    1541             :   // bar;
    1542             :   // s_branch dest_bb
    1543             :   //
    1544             :   // restore_bb:
    1545             :   //  restore s[8:9]
    1546             :   //  fallthrough dest_bb
     1547             :   //
    1548             :   // dest_bb:
    1549             :   //   buzz;
    1550             : 
    1551          32 :   RS->enterBasicBlockEnd(MBB);
    1552          31 :   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
    1553          32 :                                        MachineBasicBlock::iterator(GetPC), 0);
    1554          31 :   MRI.replaceRegWith(PCReg, Scav);
    1555          31 :   MRI.clearVirtRegs();
    1556          31 :   RS->setRegUsed(Scav);
    1557             : 
    1558          31 :   return 4 + 8 + 4 + 4;
    1559             : }
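                      : // The 4 + 8 + 4 + 4 returned above is the byte size of the inserted sequence;
                      : // a plausible breakdown (not spelled out in the source) is s_getpc_b64 (4),
                      : // s_add_u32/s_sub_u32 with a 32-bit literal (8), s_addc_u32/s_subb_u32 with an
                      : // inline 0 (4), and s_setpc_b64 (4).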
    1560             : 
    1561        1622 : unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
    1562        1622 :   switch (Cond) {
    1563             :   case SIInstrInfo::SCC_TRUE:
    1564             :     return AMDGPU::S_CBRANCH_SCC1;
    1565         414 :   case SIInstrInfo::SCC_FALSE:
    1566         414 :     return AMDGPU::S_CBRANCH_SCC0;
    1567         296 :   case SIInstrInfo::VCCNZ:
    1568         296 :     return AMDGPU::S_CBRANCH_VCCNZ;
    1569         278 :   case SIInstrInfo::VCCZ:
    1570         278 :     return AMDGPU::S_CBRANCH_VCCZ;
    1571         124 :   case SIInstrInfo::EXECNZ:
    1572         124 :     return AMDGPU::S_CBRANCH_EXECNZ;
    1573          87 :   case SIInstrInfo::EXECZ:
    1574          87 :     return AMDGPU::S_CBRANCH_EXECZ;
    1575           0 :   default:
    1576           0 :     llvm_unreachable("invalid branch predicate");
    1577             :   }
    1578             : }
    1579             : 
    1580      857462 : SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
    1581             :   switch (Opcode) {
    1582             :   case AMDGPU::S_CBRANCH_SCC0:
    1583             :     return SCC_FALSE;
    1584             :   case AMDGPU::S_CBRANCH_SCC1:
    1585             :     return SCC_TRUE;
    1586             :   case AMDGPU::S_CBRANCH_VCCNZ:
    1587             :     return VCCNZ;
    1588             :   case AMDGPU::S_CBRANCH_VCCZ:
    1589             :     return VCCZ;
    1590             :   case AMDGPU::S_CBRANCH_EXECNZ:
    1591             :     return EXECNZ;
    1592             :   case AMDGPU::S_CBRANCH_EXECZ:
    1593             :     return EXECZ;
    1594             :   default:
    1595             :     return INVALID_BR;
    1596             :   }
    1597             : }
    1598             : 
    1599      886202 : bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
    1600             :                                     MachineBasicBlock::iterator I,
    1601             :                                     MachineBasicBlock *&TBB,
    1602             :                                     MachineBasicBlock *&FBB,
    1603             :                                     SmallVectorImpl<MachineOperand> &Cond,
    1604             :                                     bool AllowModify) const {
    1605     1772404 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1606             :     // Unconditional Branch
    1607       28740 :     TBB = I->getOperand(0).getMBB();
    1608       28740 :     return false;
    1609             :   }
    1610             : 
    1611             :   MachineBasicBlock *CondBB = nullptr;
    1612             : 
    1613      857462 :   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    1614           0 :     CondBB = I->getOperand(1).getMBB();
    1615           0 :     Cond.push_back(I->getOperand(0));
    1616             :   } else {
    1617      857462 :     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    1618      857462 :     if (Pred == INVALID_BR)
    1619             :       return true;
    1620             : 
    1621       34794 :     CondBB = I->getOperand(0).getMBB();
    1622       69588 :     Cond.push_back(MachineOperand::CreateImm(Pred));
    1623       69588 :     Cond.push_back(I->getOperand(1)); // Save the branch register.
    1624             :   }
    1625             :   ++I;
    1626             : 
    1627       34794 :   if (I == MBB.end()) {
    1628             :     // Conditional branch followed by fall-through.
    1629       17672 :     TBB = CondBB;
    1630       17672 :     return false;
    1631             :   }
    1632             : 
    1633       34244 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1634       17119 :     TBB = CondBB;
    1635       17119 :     FBB = I->getOperand(0).getMBB();
    1636       17119 :     return false;
    1637             :   }
    1638             : 
    1639             :   return true;
    1640             : }
    1641             : 
    1642      932170 : bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
    1643             :                                 MachineBasicBlock *&FBB,
    1644             :                                 SmallVectorImpl<MachineOperand> &Cond,
    1645             :                                 bool AllowModify) const {
    1646      932170 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1647      932170 :   if (I == MBB.end())
    1648             :     return false;
    1649             : 
    1650     1774152 :   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    1651      873725 :     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
    1652             : 
    1653             :   ++I;
    1654             : 
    1655             :   // TODO: Should be able to treat as fallthrough?
    1656       13351 :   if (I == MBB.end())
    1657             :     return true;
    1658             : 
    1659       12477 :   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    1660             :     return true;
    1661             : 
    1662       12477 :   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
    1663             : 
    1664             :   // Specifically handle the case where the conditional branch is to the same
    1665             :   // destination as the mask branch. e.g.
    1666             :   //
    1667             :   // si_mask_branch BB8
    1668             :   // s_cbranch_execz BB8
    1669             :   // s_cbranch BB9
    1670             :   //
    1671             :   // This is required to understand divergent loops which may need the branches
    1672             :   // to be relaxed.
    1673       12477 :   if (TBB != MaskBrDest || Cond.empty())
    1674             :     return true;
    1675             : 
    1676         323 :   auto Pred = Cond[0].getImm();
    1677         323 :   return (Pred != EXECZ && Pred != EXECNZ);
    1678             : }
    1679             : 
    1680        2581 : unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
    1681             :                                    int *BytesRemoved) const {
    1682        2581 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1683             : 
    1684             :   unsigned Count = 0;
    1685             :   unsigned RemovedSize = 0;
    1686        5795 :   while (I != MBB.end()) {
    1687             :     MachineBasicBlock::iterator Next = std::next(I);
    1688        6428 :     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
    1689             :       I = Next;
    1690             :       continue;
    1691             :     }
    1692             : 
    1693        3208 :     RemovedSize += getInstSizeInBytes(*I);
    1694        3208 :     I->eraseFromParent();
    1695        3208 :     ++Count;
    1696             :     I = Next;
    1697             :   }
    1698             : 
    1699        2581 :   if (BytesRemoved)
    1700          30 :     *BytesRemoved = RemovedSize;
    1701             : 
    1702        2581 :   return Count;
    1703             : }
    1704             : 
    1705             : // Copy the flags onto the implicit condition register operand.
    1706             : static void preserveCondRegFlags(MachineOperand &CondReg,
    1707             :                                  const MachineOperand &OrigCond) {
    1708             :   CondReg.setIsUndef(OrigCond.isUndef());
    1709             :   CondReg.setIsKill(OrigCond.isKill());
    1710             : }
    1711             : 
    1712        2310 : unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
    1713             :                                    MachineBasicBlock *TBB,
    1714             :                                    MachineBasicBlock *FBB,
    1715             :                                    ArrayRef<MachineOperand> Cond,
    1716             :                                    const DebugLoc &DL,
    1717             :                                    int *BytesAdded) const {
    1718        2310 :   if (!FBB && Cond.empty()) {
    1719         688 :     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1720             :       .addMBB(TBB);
    1721         688 :     if (BytesAdded)
    1722           0 :       *BytesAdded = 4;
    1723             :     return 1;
    1724             :   }
    1725             : 
     1726        1622 :   if (Cond.size() == 1 && Cond[0].isReg()) {
    1727           0 :      BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
    1728             :        .add(Cond[0])
    1729             :        .addMBB(TBB);
    1730           0 :      return 1;
    1731             :   }
    1732             : 
    1733             :   assert(TBB && Cond[0].isImm());
    1734             : 
    1735             :   unsigned Opcode
    1736        1622 :     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
    1737             : 
    1738        1622 :   if (!FBB) {
    1739             :     Cond[1].isUndef();
    1740             :     MachineInstr *CondBr =
    1741        1516 :       BuildMI(&MBB, DL, get(Opcode))
    1742             :       .addMBB(TBB);
    1743             : 
    1744             :     // Copy the flags onto the implicit condition register operand.
    1745        1516 :     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    1746             : 
    1747        1516 :     if (BytesAdded)
    1748           0 :       *BytesAdded = 4;
    1749             :     return 1;
    1750             :   }
    1751             : 
    1752             :   assert(TBB && FBB);
    1753             : 
    1754             :   MachineInstr *CondBr =
    1755         106 :     BuildMI(&MBB, DL, get(Opcode))
    1756             :     .addMBB(TBB);
    1757         106 :   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1758             :     .addMBB(FBB);
    1759             : 
    1760         106 :   MachineOperand &CondReg = CondBr->getOperand(1);
    1761             :   CondReg.setIsUndef(Cond[1].isUndef());
    1762             :   CondReg.setIsKill(Cond[1].isKill());
    1763             : 
    1764         106 :   if (BytesAdded)
    1765          30 :       *BytesAdded = 8;
    1766             : 
    1767             :   return 2;
    1768             : }
    1769             : 
    1770        1340 : bool SIInstrInfo::reverseBranchCondition(
    1771             :   SmallVectorImpl<MachineOperand> &Cond) const {
    1772        1340 :   if (Cond.size() != 2) {
    1773             :     return true;
    1774             :   }
    1775             : 
    1776        1340 :   if (Cond[0].isImm()) {
    1777        1340 :     Cond[0].setImm(-Cond[0].getImm());
    1778        1340 :     return false;
    1779             :   }
    1780             : 
    1781             :   return true;
    1782             : }
    1783             : 
    1784          22 : bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
    1785             :                                   ArrayRef<MachineOperand> Cond,
    1786             :                                   unsigned TrueReg, unsigned FalseReg,
    1787             :                                   int &CondCycles,
    1788             :                                   int &TrueCycles, int &FalseCycles) const {
    1789          22 :   switch (Cond[0].getImm()) {
    1790          15 :   case VCCNZ:
    1791             :   case VCCZ: {
    1792          15 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1793             :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1794             :     assert(MRI.getRegClass(FalseReg) == RC);
    1795             : 
    1796          30 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1797          15 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1798             : 
    1799             :     // Limit to equal cost for branch vs. N v_cndmask_b32s.
    1800          30 :     return !RI.isSGPRClass(RC) && NumInsts <= 6;
    1801             :   }
    1802           7 :   case SCC_TRUE:
    1803             :   case SCC_FALSE: {
    1804             :     // FIXME: We could insert for VGPRs if we could replace the original compare
    1805             :     // with a vector one.
    1806           7 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1807             :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1808             :     assert(MRI.getRegClass(FalseReg) == RC);
    1809             : 
    1810          14 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1811             : 
     1812             :     // Register sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
    1813           7 :     if (NumInsts % 2 == 0)
    1814           3 :       NumInsts /= 2;
    1815             : 
    1816           7 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1817          14 :     return RI.isSGPRClass(RC);
    1818             :   }
    1819             :   default:
    1820             :     return false;
    1821             :   }
    1822             : }
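                      : // Worked example of the cost check above: selecting between two 128-bit VGPR
                      : // values under VCCNZ gives NumInsts = 128 / 32 = 4 v_cndmask_b32, which is
                      : // <= 6 and therefore allowed; the same width in SGPRs under SCC_TRUE becomes
                      : // 4 / 2 = 2 s_cselect_b64.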
    1823             : 
    1824          16 : void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
    1825             :                                MachineBasicBlock::iterator I, const DebugLoc &DL,
    1826             :                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
    1827             :                                unsigned TrueReg, unsigned FalseReg) const {
    1828          16 :   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
    1829          16 :   if (Pred == VCCZ || Pred == SCC_FALSE) {
    1830           0 :     Pred = static_cast<BranchPredicate>(-Pred);
    1831             :     std::swap(TrueReg, FalseReg);
    1832             :   }
    1833             : 
    1834          16 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1835             :   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
    1836             :   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
    1837             : 
    1838          16 :   if (DstSize == 32) {
    1839           9 :     unsigned SelOp = Pred == SCC_TRUE ?
    1840             :       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
    1841             : 
    1842             :     // Instruction's operands are backwards from what is expected.
    1843             :     MachineInstr *Select =
    1844          27 :       BuildMI(MBB, I, DL, get(SelOp), DstReg)
    1845           9 :       .addReg(FalseReg)
    1846           9 :       .addReg(TrueReg);
    1847             : 
    1848           9 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1849          10 :     return;
    1850             :   }
    1851             : 
    1852           7 :   if (DstSize == 64 && Pred == SCC_TRUE) {
    1853             :     MachineInstr *Select =
    1854           3 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
    1855           1 :       .addReg(FalseReg)
    1856           1 :       .addReg(TrueReg);
    1857             : 
    1858           1 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1859             :     return;
    1860             :   }
    1861             : 
    1862             :   static const int16_t Sub0_15[] = {
    1863             :     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    1864             :     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    1865             :     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    1866             :     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    1867             :   };
    1868             : 
    1869             :   static const int16_t Sub0_15_64[] = {
    1870             :     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    1871             :     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    1872             :     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    1873             :     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
    1874             :   };
    1875             : 
    1876             :   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
    1877             :   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
    1878             :   const int16_t *SubIndices = Sub0_15;
    1879           6 :   int NElts = DstSize / 32;
    1880             : 
     1881             :   // 64-bit select is only available for SALU.
    1882           6 :   if (Pred == SCC_TRUE) {
    1883             :     SelOp = AMDGPU::S_CSELECT_B64;
    1884             :     EltRC = &AMDGPU::SGPR_64RegClass;
    1885             :     SubIndices = Sub0_15_64;
    1886             : 
    1887             :     assert(NElts % 2 == 0);
    1888           2 :     NElts /= 2;
    1889             :   }
    1890             : 
    1891             :   MachineInstrBuilder MIB = BuildMI(
    1892          12 :     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
    1893             : 
    1894           6 :   I = MIB->getIterator();
    1895             : 
    1896             :   SmallVector<unsigned, 8> Regs;
    1897          38 :   for (int Idx = 0; Idx != NElts; ++Idx) {
    1898          16 :     unsigned DstElt = MRI.createVirtualRegister(EltRC);
    1899          16 :     Regs.push_back(DstElt);
    1900             : 
    1901          16 :     unsigned SubIdx = SubIndices[Idx];
    1902             : 
    1903             :     MachineInstr *Select =
    1904          48 :       BuildMI(MBB, I, DL, get(SelOp), DstElt)
    1905          16 :       .addReg(FalseReg, 0, SubIdx)
    1906          16 :       .addReg(TrueReg, 0, SubIdx);
    1907          16 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1908             : 
    1909          16 :     MIB.addReg(DstElt)
    1910          32 :        .addImm(SubIdx);
    1911             :   }
    1912             : }
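                      : // Illustrative expansions of the generic path above (hypothetical registers):
                      : // a 64-bit VGPR select under VCCNZ emits one v_cndmask_b32_e32 per 32-bit
                      : // sub-register and recombines the pieces with a REG_SEQUENCE, while a 128-bit
                      : // SGPR select under SCC_TRUE emits two s_cselect_b64 over 64-bit sub-registers
                      : // before the same REG_SEQUENCE.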
    1913             : 
    1914      881299 : bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
    1915     1762598 :   switch (MI.getOpcode()) {
    1916       25820 :   case AMDGPU::V_MOV_B32_e32:
    1917             :   case AMDGPU::V_MOV_B32_e64:
    1918             :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1919             :     // If there are additional implicit register operands, this may be used for
    1920             :     // register indexing so the source register operand isn't simply copied.
    1921       25820 :     unsigned NumOps = MI.getDesc().getNumOperands() +
    1922       51640 :       MI.getDesc().getNumImplicitUses();
    1923             : 
    1924       25820 :     return MI.getNumOperands() == NumOps;
    1925             :   }
    1926             :   case AMDGPU::S_MOV_B32:
    1927             :   case AMDGPU::S_MOV_B64:
    1928             :   case AMDGPU::COPY:
    1929             :     return true;
    1930      490728 :   default:
    1931      490728 :     return false;
    1932             :   }
    1933             : }
    1934             : 
    1935       77926 : unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
    1936             :     PseudoSourceValue::PSVKind Kind) const {
    1937             :   switch(Kind) {
    1938       20741 :   case PseudoSourceValue::Stack:
    1939             :   case PseudoSourceValue::FixedStack:
    1940       41482 :     return ST.getAMDGPUAS().PRIVATE_ADDRESS;
    1941             :   case PseudoSourceValue::ConstantPool:
    1942             :   case PseudoSourceValue::GOT:
    1943             :   case PseudoSourceValue::JumpTable:
    1944             :   case PseudoSourceValue::GlobalValueCallEntry:
    1945             :   case PseudoSourceValue::ExternalSymbolCallEntry:
    1946             :   case PseudoSourceValue::TargetCustom:
    1947             :     return ST.getAMDGPUAS().CONSTANT_ADDRESS;
    1948             :   }
    1949           0 :   return ST.getAMDGPUAS().FLAT_ADDRESS;
    1950             : }
    1951             : 
    1952          28 : static void removeModOperands(MachineInstr &MI) {
    1953          28 :   unsigned Opc = MI.getOpcode();
    1954          28 :   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1955             :                                               AMDGPU::OpName::src0_modifiers);
    1956          28 :   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1957             :                                               AMDGPU::OpName::src1_modifiers);
    1958          28 :   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1959             :                                               AMDGPU::OpName::src2_modifiers);
    1960             : 
    1961          28 :   MI.RemoveOperand(Src2ModIdx);
    1962          28 :   MI.RemoveOperand(Src1ModIdx);
    1963          28 :   MI.RemoveOperand(Src0ModIdx);
    1964          28 : }
    1965             : 
    1966       53296 : bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
    1967             :                                 unsigned Reg, MachineRegisterInfo *MRI) const {
    1968       53296 :   if (!MRI->hasOneNonDBGUse(Reg))
    1969             :     return false;
    1970             : 
    1971       43062 :   switch (DefMI.getOpcode()) {
    1972             :   default:
    1973             :     return false;
    1974             :   case AMDGPU::S_MOV_B64:
     1975             :     // TODO: We could fold 64-bit immediates, but this gets complicated
     1976             :     // when there are sub-registers.
    1977             :     return false;
    1978             : 
    1979             :   case AMDGPU::V_MOV_B32_e32:
    1980             :   case AMDGPU::S_MOV_B32:
    1981             :     break;
    1982             :   }
    1983             : 
    1984       20938 :   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    1985             :   assert(ImmOp);
    1986             :   // FIXME: We could handle FrameIndex values here.
    1987       20938 :   if (!ImmOp->isImm())
    1988             :     return false;
    1989             : 
    1990       20563 :   unsigned Opc = UseMI.getOpcode();
    1991       20563 :   if (Opc == AMDGPU::COPY) {
    1992        3899 :     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    1993        3899 :     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    1994        3899 :     UseMI.setDesc(get(NewOpc));
    1995        7798 :     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    1996        3899 :     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    1997        3899 :     return true;
    1998             :   }
    1999             : 
    2000       16664 :   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
    2001       16461 :       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    2002             :     // Don't fold if we are using source or output modifiers. The new VOP2
    2003             :     // instructions don't have them.
    2004         240 :     if (hasAnyModifiersSet(UseMI))
    2005             :       return false;
    2006             : 
    2007             :     // If this is a free constant, there's no reason to do this.
    2008             :     // TODO: We could fold this here instead of letting SIFoldOperands do it
    2009             :     // later.
    2010         205 :     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
    2011             : 
    2012             :     // Any src operand can be used for the legality check.
    2013         205 :     if (isInlineConstant(UseMI, *Src0, *ImmOp))
    2014             :       return false;
    2015             : 
    2016             :     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    2017         107 :     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    2018         107 :     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
    2019             : 
    2020             :     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    2021             :     // We should only expect these to be on src0 due to canonicalizations.
    2022         107 :     if (Src0->isReg() && Src0->getReg() == Reg) {
    2023          21 :       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    2024             :         return false;
    2025             : 
    2026           6 :       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
    2027             :         return false;
    2028             : 
    2029             :       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
    2030             : 
    2031           3 :       const int64_t Imm = ImmOp->getImm();
    2032             : 
    2033             :       // FIXME: This would be a lot easier if we could return a new instruction
    2034             :       // instead of having to modify in place.
    2035             : 
    2036             :       // Remove these first since they are at the end.
    2037           3 :       UseMI.RemoveOperand(
    2038           3 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    2039           3 :       UseMI.RemoveOperand(
    2040           3 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    2041             : 
    2042           3 :       unsigned Src1Reg = Src1->getReg();
    2043             :       unsigned Src1SubReg = Src1->getSubReg();
    2044           3 :       Src0->setReg(Src1Reg);
    2045             :       Src0->setSubReg(Src1SubReg);
    2046             :       Src0->setIsKill(Src1->isKill());
    2047             : 
    2048           3 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    2049           3 :           Opc == AMDGPU::V_MAC_F16_e64)
    2050           0 :         UseMI.untieRegOperand(
    2051           0 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    2052             : 
    2053           3 :       Src1->ChangeToImmediate(Imm);
    2054             : 
    2055           3 :       removeModOperands(UseMI);
    2056           3 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
    2057             : 
    2058           3 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    2059           3 :       if (DeleteDef)
    2060           0 :         DefMI.eraseFromParent();
    2061             : 
    2062             :       return true;
    2063             :     }
    2064             : 
    2065             :     // Added part is the constant: Use v_madak_{f16, f32}.
    2066         200 :     if (Src2->isReg() && Src2->getReg() == Reg) {
    2067             :       // Not allowed to use constant bus for another operand.
    2068             :       // We can however allow an inline immediate as src0.
    2069          38 :       if (!Src0->isImm() &&
    2070          76 :           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
    2071             :         return false;
    2072             : 
    2073          56 :       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    2074             :         return false;
    2075             : 
    2076          25 :       const int64_t Imm = ImmOp->getImm();
    2077             : 
    2078             :       // FIXME: This would be a lot easier if we could return a new instruction
    2079             :       // instead of having to modify in place.
    2080             : 
    2081             :       // Remove these first since they are at the end.
    2082          25 :       UseMI.RemoveOperand(
    2083          25 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    2084          25 :       UseMI.RemoveOperand(
    2085          25 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    2086             : 
    2087          25 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    2088          25 :           Opc == AMDGPU::V_MAC_F16_e64)
    2089          22 :         UseMI.untieRegOperand(
    2090          22 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    2091             : 
    2092             :       // ChangeToImmediate() adds Src2 back to the instruction.
    2093          25 :       Src2->ChangeToImmediate(Imm);
    2094             : 
    2095             :       // These come before src2.
    2096          25 :       removeModOperands(UseMI);
    2097          25 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
    2098             : 
    2099          25 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    2100          25 :       if (DeleteDef)
    2101           0 :         DefMI.eraseFromParent();
    2102             : 
    2103             :       return true;
    2104             :     }
    2105             :   }
    2106             : 
    2107             :   return false;
    2108             : }
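
// Illustrative sketch (not part of the original file): the two folds above
// absorb a single-use literal V_MOV into the multiply-add opcode itself.
// Assuming %k has exactly one non-debug use and 0x40400000 (3.0f) is not an
// inline constant, the rewrites are, schematically:
//
//   %k   = V_MOV_B32_e32 0x40400000
//   %dst = %k * %a + %b   (V_MAD/V_MAC)  -->  %dst = V_MADMK_F32 %a, 0x40400000, %b
//   %dst = %a * %b + %k   (V_MAD/V_MAC)  -->  %dst = V_MADAK_F32 %a, %b, 0x40400000
//
// Source and output modifiers must be clear, since the madmk/madak forms
// cannot encode them.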
    2109             : 
    2110             : static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
    2111             :                                 int WidthB, int OffsetB) {
    2112       22079 :   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
    2113       22079 :   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
    2114       22079 :   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
    2115       22079 :   return LowOffset + LowWidth <= HighOffset;
    2116             : }
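
// Illustrative worked example (not part of the original file): the helper above
// treats [Offset, Offset + Width) as half-open byte ranges, so two DWORD
// accesses at offsets 0 and 4 are disjoint while a DWORDX2 at offset 0 overlaps
// a DWORD at offset 4:
//
//   offsetsDoNotOverlap(/*WidthA=*/4, /*OffsetA=*/0, /*WidthB=*/4, /*OffsetB=*/4) -> true
//   offsetsDoNotOverlap(/*WidthA=*/8, /*OffsetA=*/0, /*WidthB=*/4, /*OffsetB=*/4) -> false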
    2117             : 
    2118      875564 : bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
    2119             :                                                MachineInstr &MIb) const {
    2120             :   unsigned BaseReg0, BaseReg1;
    2121             :   int64_t Offset0, Offset1;
    2122             : 
    2123      960792 :   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
    2124       85228 :       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    2125             : 
    2126       80646 :     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
    2127             :       // FIXME: Handle ds_read2 / ds_write2.
    2128             :       return false;
    2129             :     }
    2130       58040 :     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    2131       58040 :     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    2132       80119 :     if (BaseReg0 == BaseReg1 &&
    2133       22079 :         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
    2134             :       return true;
    2135             :     }
    2136             :   }
    2137             : 
    2138             :   return false;
    2139             : }
    2140             : 
    2141      961046 : bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
    2142             :                                                   MachineInstr &MIb,
    2143             :                                                   AliasAnalysis *AA) const {
    2144             :   assert((MIa.mayLoad() || MIa.mayStore()) &&
    2145             :          "MIa must load from or modify a memory location");
    2146             :   assert((MIb.mayLoad() || MIb.mayStore()) &&
    2147             :          "MIb must load from or modify a memory location");
    2148             : 
    2149      961046 :   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    2150             :     return false;
    2151             : 
    2152             :   // XXX - Can we relax this between address spaces?
    2153      961046 :   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    2154             :     return false;
    2155             : 
    2156      960894 :   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
    2157        2553 :     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
    2158        2553 :     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
    2159        4415 :     if (MMOa->getValue() && MMOb->getValue()) {
    2160        1645 :       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
    2161        1645 :       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
    2162        1645 :       if (!AA->alias(LocA, LocB))
    2163        1155 :         return true;
    2164             :     }
    2165             :   }
    2166             : 
    2167             :   // TODO: Should we check the address space from the MachineMemOperand? That
    2168             :   // would allow us to distinguish objects we know don't alias based on the
    2169             :   // underlying address space, even if it was lowered to a different one,
    2170             :   // e.g. private accesses lowered to use MUBUF instructions on a scratch
    2171             :   // buffer.
    2172      959739 :   if (isDS(MIa)) {
    2173      117552 :     if (isDS(MIb))
    2174       63160 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2175             : 
    2176       95775 :     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
    2177             :   }
    2178             : 
    2179      888012 :   if (isMUBUF(MIa) || isMTBUF(MIa)) {
    2180      810181 :     if (isMUBUF(MIb) || isMTBUF(MIb))
    2181      782651 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2182             : 
    2183       15700 :     return !isFLAT(MIb) && !isSMRD(MIb);
    2184             :   }
    2185             : 
    2186       45787 :   if (isSMRD(MIa)) {
    2187        2353 :     if (isSMRD(MIb))
    2188           0 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2189             : 
    2190        2353 :     return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
    2191             :   }
    2192             : 
    2193       43434 :   if (isFLAT(MIa)) {
    2194       43375 :     if (isFLAT(MIb))
    2195       29753 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2196             : 
    2197             :     return false;
    2198             :   }
    2199             : 
    2200             :   return false;
    2201             : }
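
// Illustrative note (not part of the original file): the opcode-class checks in
// the function above encode which hardware memories can alias. A DS (LDS)
// access and a MUBUF/MTBUF (buffer) access are trivially disjoint because they
// target different memories, while FLAT can address LDS as well, so a DS access
// is only known disjoint from a FLAT one when the FLAT access is segment
// specific.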
    2202             : 
    2203         780 : static int64_t getFoldableImm(const MachineOperand* MO) {
    2204         780 :   if (!MO->isReg())
    2205             :     return false;
    2206         778 :   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
    2207         778 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2208         778 :   auto Def = MRI.getUniqueVRegDef(MO->getReg());
    2209        1570 :   if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
    2210          23 :       Def->getOperand(1).isImm())
    2211          23 :     return Def->getOperand(1).getImm();
    2212             :   return AMDGPU::NoRegister;
    2213             : }
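
// Illustrative note (not part of the original file): both early returns above
// produce 0 (false and AMDGPU::NoRegister are both zero), so callers such as
// convertToThreeAddress() below test the result for truth and treat 0 as "no
// foldable immediate"; a V_MOV of the literal 0 is therefore simply not folded
// through this path.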
    2214             : 
    2215         307 : MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
    2216             :                                                  MachineInstr &MI,
    2217             :                                                  LiveVariables *LV) const {
    2218         307 :   unsigned Opc = MI.getOpcode();
    2219             :   bool IsF16 = false;
    2220         307 :   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
    2221             : 
    2222         307 :   switch (Opc) {
    2223             :   default:
    2224             :     return nullptr;
    2225           0 :   case AMDGPU::V_MAC_F16_e64:
    2226             :     IsF16 = true;
    2227             :     LLVM_FALLTHROUGH;
    2228             :   case AMDGPU::V_MAC_F32_e64:
    2229             :   case AMDGPU::V_FMAC_F32_e64:
    2230             :     break;
    2231           6 :   case AMDGPU::V_MAC_F16_e32:
    2232             :     IsF16 = true;
    2233             :     LLVM_FALLTHROUGH;
    2234         293 :   case AMDGPU::V_MAC_F32_e32:
    2235             :   case AMDGPU::V_FMAC_F32_e32: {
    2236         293 :     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
    2237             :                                              AMDGPU::OpName::src0);
    2238         293 :     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    2239         293 :     if (!Src0->isReg() && !Src0->isImm())
    2240             :       return nullptr;
    2241             : 
    2242         292 :     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
    2243             :       return nullptr;
    2244             : 
    2245             :     break;
    2246             :   }
    2247             :   }
    2248             : 
    2249         303 :   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    2250         303 :   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
    2251         303 :   const MachineOperand *Src0Mods =
    2252             :     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
    2253         303 :   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
    2254         303 :   const MachineOperand *Src1Mods =
    2255             :     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
    2256         303 :   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
    2257         303 :   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2258         303 :   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2259             : 
    2260         587 :   if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
    2261             :       // If we have an SGPR input, we will violate the constant bus restriction.
    2262         562 :       (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
    2263         271 :     if (auto Imm = getFoldableImm(Src2)) {
    2264          12 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2265          12 :                      get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
    2266             :                .add(*Dst)
    2267             :                .add(*Src0)
    2268             :                .add(*Src1)
    2269             :                .addImm(Imm);
    2270             :     }
    2271         259 :     if (auto Imm = getFoldableImm(Src1)) {
    2272           9 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2273           9 :                      get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2274             :                .add(*Dst)
    2275             :                .add(*Src0)
    2276             :                .addImm(Imm)
    2277             :                .add(*Src2);
    2278             :     }
    2279         250 :     if (auto Imm = getFoldableImm(Src0)) {
    2280           2 :       if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
    2281             :                            AMDGPU::OpName::src0), Src1))
    2282           2 :         return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2283           2 :                        get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2284             :                  .add(*Dst)
    2285             :                  .add(*Src1)
    2286             :                  .addImm(Imm)
    2287             :                  .add(*Src2);
    2288             :     }
    2289             :   }
    2290             : 
    2291             :   assert((!IsFMA || !IsF16) && "fmac only expected with f32");
    2292         280 :   unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
    2293             :     (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
    2294         560 :   return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
    2295             :       .add(*Dst)
    2296         280 :       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
    2297             :       .add(*Src0)
    2298         280 :       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
    2299             :       .add(*Src1)
    2300             :       .addImm(0) // Src2 mods
    2301             :       .add(*Src2)
    2302         280 :       .addImm(Clamp ? Clamp->getImm() : 0)
    2303         280 :       .addImm(Omod ? Omod->getImm() : 0);
    2304             : }
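
// Illustrative sketch (not part of the original file): the conversion above
// rewrites the two-address MAC/FMAC (vdst tied to src2) into the three-address
// MAD/FMA form with explicit modifier, clamp, and omod operands, schematically:
//
//   %acc = V_MAC_F32_e32 %a, %b, %acc(tied)
//     -->
//   %dst = V_MAD_F32 0, %a, 0, %b, 0, %acc, 0, 0
//
// When one of the sources is defined by a foldable V_MOV immediate (see
// getFoldableImm above), the madak/madmk forms are emitted instead so the
// constant is encoded directly in the instruction.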
    2305             : 
    2306             : // It's not generally safe to move VALU instructions across these since they
    2307             : // would start using the register as a base index rather than reading it directly.
    2308             : // XXX - Why isn't hasSideEffects sufficient for these?
    2309             : static bool changesVGPRIndexingMode(const MachineInstr &MI) {
    2310      481341 :   switch (MI.getOpcode()) {
    2311             :   case AMDGPU::S_SET_GPR_IDX_ON:
    2312             :   case AMDGPU::S_SET_GPR_IDX_MODE:
    2313             :   case AMDGPU::S_SET_GPR_IDX_OFF:
    2314             :     return true;
    2315             :   default:
    2316             :     return false;
    2317             :   }
    2318             : }
    2319             : 
    2320      519471 : bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
    2321             :                                        const MachineBasicBlock *MBB,
    2322             :                                        const MachineFunction &MF) const {
    2323             :   // XXX - Do we want the SP check in the base implementation?
    2324             : 
    2325             :   // Target-independent instructions do not have an implicit-use of EXEC, even
    2326             :   // when they operate on VGPRs. Treating EXEC modifications as scheduling
    2327             :   // boundaries prevents incorrect movements of such instructions.
    2328     1003538 :   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
    2329      965570 :          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
    2330      962978 :          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
    2331      519471 :          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
    2332      519471 :          changesVGPRIndexingMode(MI);
    2333             : }
    2334             : 
    2335        5508 : bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
    2336        5508 :   switch (Imm.getBitWidth()) {
    2337          34 :   case 32:
    2338          68 :     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
    2339          68 :                                         ST.hasInv2PiInlineImm());
    2340        5364 :   case 64:
    2341        5364 :     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
    2342       10728 :                                         ST.hasInv2PiInlineImm());
    2343         110 :   case 16:
    2344         220 :     return ST.has16BitInsts() &&
    2345         220 :            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
    2346         110 :                                         ST.hasInv2PiInlineImm());
    2347           0 :   default:
    2348           0 :     llvm_unreachable("invalid bitwidth");
    2349             :   }
    2350             : }
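
// Illustrative note (not part of the original file): the inline constants these
// helpers accept are, roughly, the integers -16..64 and the floating-point
// values 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) on subtargets where
// hasInv2PiInlineImm() is true; any other value is a literal and costs an extra
// encoding dword (and a constant bus read, see usesConstantBus() below).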
    2351             : 
    2352     4558717 : bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
    2353             :                                    uint8_t OperandType) const {
    2354             :   if (!MO.isImm() ||
    2355     4558717 :       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
    2356             :       OperandType > AMDGPU::OPERAND_SRC_LAST)
    2357             :     return false;
    2358             : 
    2359             :   // MachineOperand provides no way to tell the true operand size, since it only
    2360             :   // records a 64-bit value. We need to know the size to determine if a 32-bit
    2361             :   // floating point immediate bit pattern is legal for an integer immediate. It
    2362             :   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
    2363             : 
    2364     4509502 :   int64_t Imm = MO.getImm();
    2365     4509502 :   switch (OperandType) {
    2366     4288170 :   case AMDGPU::OPERAND_REG_IMM_INT32:
    2367             :   case AMDGPU::OPERAND_REG_IMM_FP32:
    2368             :   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2369             :   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
    2370     4288170 :     int32_t Trunc = static_cast<int32_t>(Imm);
    2371     8574855 :     return Trunc == Imm &&
    2372     4286685 :            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
    2373             :   }
    2374       46849 :   case AMDGPU::OPERAND_REG_IMM_INT64:
    2375             :   case AMDGPU::OPERAND_REG_IMM_FP64:
    2376             :   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2377             :   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2378       46849 :     return AMDGPU::isInlinableLiteral64(MO.getImm(),
    2379       93698 :                                         ST.hasInv2PiInlineImm());
    2380             :   case AMDGPU::OPERAND_REG_IMM_INT16:
    2381             :   case AMDGPU::OPERAND_REG_IMM_FP16:
    2382             :   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2383             :   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2384      168819 :     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
    2385             :       // A few special case instructions have 16-bit operands on subtargets
    2386             :       // where 16-bit instructions are not legal.
    2387             :       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
    2388             :       // constants in these cases.
    2389             :       int16_t Trunc = static_cast<int16_t>(Imm);
    2390      337388 :       return ST.has16BitInsts() &&
    2391      168692 :              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    2392             :     }
    2393             : 
    2394             :     return false;
    2395             :   }
    2396        5664 :   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    2397             :   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
    2398        5664 :     if (isUInt<16>(Imm)) {
    2399         857 :       int16_t Trunc = static_cast<int16_t>(Imm);
    2400        1714 :       return ST.has16BitInsts() &&
    2401         857 :              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    2402             :     }
    2403        4807 :     if (!(Imm & 0xffff)) {
    2404          50 :       return ST.has16BitInsts() &&
    2405          25 :              AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
    2406             :     }
    2407             :     uint32_t Trunc = static_cast<uint32_t>(Imm);
    2408        4782 :     return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
    2409             :   }
    2410           0 :   default:
    2411           0 :     llvm_unreachable("invalid bitwidth");
    2412             :   }
    2413             : }
    2414             : 
    2415      466162 : bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
    2416             :                                         const MCOperandInfo &OpInfo) const {
    2417             :   switch (MO.getType()) {
    2418             :   case MachineOperand::MO_Register:
    2419             :     return false;
    2420      141601 :   case MachineOperand::MO_Immediate:
    2421      283202 :     return !isInlineConstant(MO, OpInfo);
    2422             :   case MachineOperand::MO_FrameIndex:
    2423             :   case MachineOperand::MO_MachineBasicBlock:
    2424             :   case MachineOperand::MO_ExternalSymbol:
    2425             :   case MachineOperand::MO_GlobalAddress:
    2426             :   case MachineOperand::MO_MCSymbol:
    2427             :     return true;
    2428           0 :   default:
    2429           0 :     llvm_unreachable("unexpected operand type");
    2430             :   }
    2431             : }
    2432             : 
    2433             : static bool compareMachineOp(const MachineOperand &Op0,
    2434             :                              const MachineOperand &Op1) {
    2435             :   if (Op0.getType() != Op1.getType())
    2436             :     return false;
    2437             : 
    2438             :   switch (Op0.getType()) {
    2439        5628 :   case MachineOperand::MO_Register:
    2440       17098 :     return Op0.getReg() == Op1.getReg();
    2441             :   case MachineOperand::MO_Immediate:
    2442             :     return Op0.getImm() == Op1.getImm();
    2443             :   default:
    2444             :     llvm_unreachable("Didn't expect to be comparing these operand types");
    2445             :   }
    2446             : }
    2447             : 
    2448       85350 : bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
    2449             :                                     const MachineOperand &MO) const {
    2450      256050 :   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
    2451             : 
    2452             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    2453             : 
    2454       85350 :   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    2455             :     return true;
    2456             : 
    2457       85350 :   if (OpInfo.RegClass < 0)
    2458             :     return false;
    2459             : 
    2460      170351 :   if (MO.isImm() && isInlineConstant(MO, OpInfo))
    2461      117238 :     return RI.opCanUseInlineConstant(OpInfo.OperandType);
    2462             : 
    2463       53462 :   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
    2464             : }
    2465             : 
    2466      705937 : bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
    2467      705937 :   int Op32 = AMDGPU::getVOPe32(Opcode);
    2468      705937 :   if (Op32 == -1)
    2469             :     return false;
    2470             : 
    2471      120217 :   return pseudoToMCOpcode(Op32) != -1;
    2472             : }
    2473             : 
    2474           0 : bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
    2475             :   // The src0_modifier operand is present on all instructions
    2476             :   // that have modifiers.
    2477             : 
    2478           0 :   return AMDGPU::getNamedOperandIdx(Opcode,
    2479           0 :                                     AMDGPU::OpName::src0_modifiers) != -1;
    2480             : }
    2481             : 
    2482      199872 : bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
    2483             :                                   unsigned OpName) const {
    2484             :   const MachineOperand *Mods = getNamedOperand(MI, OpName);
    2485      199872 :   return Mods && Mods->getImm();
    2486             : }
    2487             : 
    2488         240 : bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
    2489         470 :   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
    2490         457 :          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
    2491         436 :          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
    2492         656 :          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
    2493         447 :          hasModifiersSet(MI, AMDGPU::OpName::omod);
    2494             : }
    2495             : 
    2496     7805668 : bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
    2497             :                                   const MachineOperand &MO,
    2498             :                                   const MCOperandInfo &OpInfo) const {
    2499             :   // Literal constants use the constant bus.
    2500             :   //if (isLiteralConstantLike(MO, OpInfo))
    2501             :   // return true;
    2502     7805668 :   if (MO.isImm())
    2503     4307208 :     return !isInlineConstant(MO, OpInfo);
    2504             : 
    2505     5652064 :   if (!MO.isReg())
    2506             :     return true; // Misc other operands like FrameIndex
    2507             : 
    2508     5641839 :   if (!MO.isUse())
    2509             :     return false;
    2510             : 
    2511    11005240 :   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    2512     5460868 :     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
    2513             : 
    2514             :   // FLAT_SCR is just an SGPR pair.
    2515     2772186 :   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    2516             :     return true;
    2517             : 
    2518             :   // EXEC register uses the constant bus.
    2519     2772186 :   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    2520             :     return true;
    2521             : 
    2522             :   // SGPRs use the constant bus
    2523     2772186 :   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
    2524      665060 :           (!MO.isImplicit() &&
    2525     4404107 :            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
    2526     1918205 :             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
    2527             : }
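
// Illustrative note (not part of the original file): VALU encodings on these
// subtargets may read the constant bus at most once per instruction, and
// verifyInstruction() below uses this helper to count such reads. For example
// (GCN assembly, assumed syntax):
//
//   v_add_f32_e64 v0, s0, v1          ; ok  - one SGPR read
//   v_add_f32_e64 v0, s0, s1          ; bad - two constant bus reads
//   v_add_f32_e32 v0, 0x40490fdb, v1  ; the literal also occupies the bus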
    2528             : 
    2529     3833975 : static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
    2530    12758975 :   for (const MachineOperand &MO : MI.implicit_operands()) {
    2531             :     // We only care about reads.
    2532     4516826 :     if (MO.isDef())
    2533      249115 :       continue;
    2534             : 
    2535     4267711 :     switch (MO.getReg()) {
    2536             :     case AMDGPU::VCC:
    2537             :     case AMDGPU::M0:
    2538             :     case AMDGPU::FLAT_SCR:
    2539             :       return MO.getReg();
    2540             : 
    2541             :     default:
    2542             :       break;
    2543             :     }
    2544             :   }
    2545             : 
    2546             :   return AMDGPU::NoRegister;
    2547             : }
    2548             : 
    2549    10471723 : static bool shouldReadExec(const MachineInstr &MI) {
    2550    10471723 :   if (SIInstrInfo::isVALU(MI)) {
    2551     3810785 :     switch (MI.getOpcode()) {
    2552             :     case AMDGPU::V_READLANE_B32:
    2553             :     case AMDGPU::V_READLANE_B32_si:
    2554             :     case AMDGPU::V_READLANE_B32_vi:
    2555             :     case AMDGPU::V_WRITELANE_B32:
    2556             :     case AMDGPU::V_WRITELANE_B32_si:
    2557             :     case AMDGPU::V_WRITELANE_B32_vi:
    2558             :       return false;
    2559             :     }
    2560             : 
    2561     3780007 :     return true;
    2562             :   }
    2563             : 
    2564     6660938 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
    2565     9875071 :       SIInstrInfo::isSALU(MI) ||
    2566             :       SIInstrInfo::isSMRD(MI))
    2567             :     return false;
    2568             : 
    2569             :   return true;
    2570             : }
    2571             : 
    2572        2970 : static bool isSubRegOf(const SIRegisterInfo &TRI,
    2573             :                        const MachineOperand &SuperVec,
    2574             :                        const MachineOperand &SubReg) {
    2575        5940 :   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    2576        3812 :     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
    2577             : 
    2578        2128 :   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
    2579        1064 :          SubReg.getReg() == SuperVec.getReg();
    2580             : }
    2581             : 
    2582    15434652 : bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
    2583             :                                     StringRef &ErrInfo) const {
    2584    15434652 :   uint16_t Opcode = MI.getOpcode();
    2585    15434652 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    2586             :     return true;
    2587             : 
    2588    10471723 :   const MachineFunction *MF = MI.getParent()->getParent();
    2589    10471723 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2590             : 
    2591    10471723 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
    2592    10471723 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
    2593    10471723 :   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
    2594             : 
    2595             :   // Make sure the number of operands is correct.
    2596    10471723 :   const MCInstrDesc &Desc = get(Opcode);
    2597    31379962 :   if (!Desc.isVariadic() &&
    2598    20873032 :       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    2599           0 :     ErrInfo = "Instruction has wrong number of operands.";
    2600           0 :     return false;
    2601             :   }
    2602             : 
    2603    10471723 :   if (MI.isInlineAsm()) {
    2604             :     // Verify register classes for inlineasm constraints.
    2605           0 :     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
    2606           0 :          I != E; ++I) {
    2607           0 :       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
    2608           0 :       if (!RC)
    2609           0 :         continue;
    2610             : 
    2611           0 :       const MachineOperand &Op = MI.getOperand(I);
    2612           0 :       if (!Op.isReg())
    2613           0 :         continue;
    2614             : 
    2615           0 :       unsigned Reg = Op.getReg();
    2616           0 :       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
    2617           0 :         ErrInfo = "inlineasm operand has incorrect register class.";
    2618           0 :         return false;
    2619             :       }
    2620             :     }
    2621             : 
    2622             :     return true;
    2623             :   }
    2624             : 
    2625             :   // Make sure the register classes are correct.
    2626    47735330 :   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    2627    74527214 :     if (MI.getOperand(i).isFPImm()) {
    2628           0 :       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
    2629             :                 "all fp values to integers.";
    2630           0 :       return false;
    2631             :     }
    2632             : 
    2633    37263607 :     int RegClass = Desc.OpInfo[i].RegClass;
    2634             : 
    2635    37263607 :     switch (Desc.OpInfo[i].OperandType) {
    2636    14979093 :     case MCOI::OPERAND_REGISTER:
    2637    14979093 :       if (MI.getOperand(i).isImm()) {
    2638           0 :         ErrInfo = "Illegal immediate value for operand.";
    2639           0 :         return false;
    2640             :       }
    2641             :       break;
    2642             :     case AMDGPU::OPERAND_REG_IMM_INT32:
    2643             :     case AMDGPU::OPERAND_REG_IMM_FP32:
    2644             :       break;
    2645     5018702 :     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2646             :     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    2647             :     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2648             :     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2649             :     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2650             :     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2651             :       const MachineOperand &MO = MI.getOperand(i);
    2652     7068236 :       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
    2653           0 :         ErrInfo = "Illegal immediate value for operand.";
    2654           0 :         return false;
    2655             :       }
    2656             :       break;
    2657             :     }
    2658    10907096 :     case MCOI::OPERAND_IMMEDIATE:
    2659             :     case AMDGPU::OPERAND_KIMM32:
    2660             :       // Check if this operand is an immediate.
    2661             :       // FrameIndex operands will be replaced by immediates, so they are
    2662             :       // allowed.
    2663    10907096 :       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
    2664           0 :         ErrInfo = "Expected immediate, but got non-immediate";
    2665           0 :         return false;
    2666             :       }
    2667             :       LLVM_FALLTHROUGH;
    2668             :     default:
    2669    12641145 :       continue;
    2670             :     }
    2671             : 
    2672    49244924 :     if (!MI.getOperand(i).isReg())
    2673     4763835 :       continue;
    2674             : 
    2675    19858627 :     if (RegClass != -1) {
    2676    19858627 :       unsigned Reg = MI.getOperand(i).getReg();
    2677    48742944 :       if (Reg == AMDGPU::NoRegister ||
    2678             :           TargetRegisterInfo::isVirtualRegister(Reg))
    2679     9025690 :         continue;
    2680             : 
    2681    10832937 :       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
    2682    21665874 :       if (!RC->contains(Reg)) {
    2683           0 :         ErrInfo = "Operand has incorrect register class.";
    2684           0 :         return false;
    2685             :       }
    2686             :     }
    2687             :   }
    2688             : 
    2689             :   // Verify SDWA
    2690    10471723 :   if (isSDWA(MI)) {
    2691       49790 :     if (!ST.hasSDWA()) {
    2692           0 :       ErrInfo = "SDWA is not supported on this target";
    2693           0 :       return false;
    2694             :     }
    2695             : 
    2696       49790 :     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
    2697             : 
    2698       49790 :     const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
    2699             : 
    2700      448110 :     for (int OpIdx : OpIndices) {
    2701      199160 :       if (OpIdx == -1)
    2702       57723 :         continue;
    2703      141437 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2704             : 
    2705      141437 :       if (!ST.hasSDWAScalar()) {
    2706             :         // Only VGPRS on VI
    2707      118320 :         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
    2708           0 :           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
    2709           0 :           return false;
    2710             :         }
    2711             :       } else {
    2712             :         // No immediates on GFX9
    2713       23117 :         if (!MO.isReg()) {
    2714           0 :           ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
    2715           0 :           return false;
    2716             :         }
    2717             :       }
    2718             :     }
    2719             : 
    2720       49790 :     if (!ST.hasSDWAOmod()) {
    2721             :       // No omod allowed on VI
    2722             :       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2723       54897 :       if (OMod != nullptr &&
    2724       12950 :         (!OMod->isImm() || OMod->getImm() != 0)) {
    2725           0 :         ErrInfo = "OMod not allowed in SDWA instructions on VI";
    2726           0 :         return false;
    2727             :       }
    2728             :     }
    2729             : 
    2730       49790 :     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    2731       49790 :     if (isVOPC(BasicOpcode)) {
    2732          63 :       if (!ST.hasSDWASdst() && DstIdx != -1) {
    2733             :         // Only vcc allowed as dst on VI for VOPC
    2734           0 :         const MachineOperand &Dst = MI.getOperand(DstIdx);
    2735           0 :         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
    2736           0 :           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
    2737           0 :           return false;
    2738             :         }
    2739          63 :       } else if (!ST.hasSDWAOutModsVOPC()) {
    2740             :         // No clamp allowed on GFX9 for VOPC
    2741             :         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2742         100 :         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
    2743           0 :           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
    2744           0 :           return false;
    2745             :         }
    2746             : 
    2747             :         // No omod allowed on GFX9 for VOPC
    2748             :         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2749          50 :         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
    2750           0 :           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
    2751           0 :           return false;
    2752             :         }
    2753             :       }
    2754             :     }
    2755             : 
    2756             :     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    2757      149244 :     if (DstUnused && DstUnused->isImm() &&
    2758       49727 :         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
    2759         236 :       const MachineOperand &Dst = MI.getOperand(DstIdx);
    2760         472 :       if (!Dst.isReg() || !Dst.isTied()) {
    2761           0 :         ErrInfo = "Dst register should have tied register";
    2762           0 :         return false;
    2763             :       }
    2764             : 
    2765             :       const MachineOperand &TiedMO =
    2766         236 :           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
    2767         708 :       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
    2768           0 :         ErrInfo =
    2769             :             "Dst register should be tied to implicit use of preserved register";
    2770           0 :         return false;
    2771         616 :       } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
    2772         144 :                  Dst.getReg() != TiedMO.getReg()) {
    2773           0 :         ErrInfo = "Dst register should use same physical register as preserved";
    2774           0 :         return false;
    2775             :       }
    2776             :     }
    2777             :   }
    2778             : 
    2779             :   // Verify VOP*. Ignore multiple sgpr operands on writelane.
    2780    10471723 :   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
    2781    52342687 :       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
    2782             :     // Only look at the true operands. Only a real operand can use the constant
    2783             :     // bus, and we don't want to check pseudo-operands like the source modifier
    2784             :     // flags.
    2785     3765847 :     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
    2786             : 
    2787             :     unsigned ConstantBusCount = 0;
    2788             :     unsigned LiteralCount = 0;
    2789             : 
    2790     3765847 :     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
    2791             :       ++ConstantBusCount;
    2792             : 
    2793     3765847 :     unsigned SGPRUsed = findImplicitSGPRRead(MI);
    2794     3765847 :     if (SGPRUsed != AMDGPU::NoRegister)
    2795       53953 :       ++ConstantBusCount;
    2796             : 
    2797    18237039 :     for (int OpIdx : OpIndices) {
    2798    10221165 :       if (OpIdx == -1)
    2799             :         break;
    2800     7235596 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2801     7235596 :       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
    2802     1584421 :         if (MO.isReg()) {
    2803     1418670 :           if (MO.getReg() != SGPRUsed)
    2804     1403330 :             ++ConstantBusCount;
    2805             :           SGPRUsed = MO.getReg();
    2806             :         } else {
    2807      165751 :           ++ConstantBusCount;
    2808      165751 :           ++LiteralCount;
    2809             :         }
    2810             :       }
    2811             :     }
    2812     3765847 :     if (ConstantBusCount > 1) {
    2813           0 :       ErrInfo = "VOP* instruction uses the constant bus more than once";
    2814           0 :       return false;
    2815             :     }
    2816             : 
    2817     3765847 :     if (isVOP3(MI) && LiteralCount) {
    2818           0 :       ErrInfo = "VOP3 instruction uses literal";
    2819           0 :       return false;
    2820             :     }
    2821             :   }
    2822             : 
    2823             :   // Verify misc. restrictions on specific instructions.
    2824    20943446 :   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
    2825             :       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    2826       11987 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    2827       11987 :     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    2828       11987 :     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    2829       35441 :     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
    2830       17098 :       if (!compareMachineOp(Src0, Src1) &&
    2831             :           !compareMachineOp(Src0, Src2)) {
    2832           0 :         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
    2833           0 :         return false;
    2834             :       }
    2835             :     }
    2836             :   }
    2837             : 
    2838    10471723 :   if (isSOPK(MI)) {
    2839       12229 :     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    2840       12229 :     if (sopkIsZext(MI)) {
    2841         792 :       if (!isUInt<16>(Imm)) {
    2842           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    2843           0 :         return false;
    2844             :       }
    2845             :     } else {
    2846       11437 :       if (!isInt<16>(Imm)) {
    2847           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    2848           0 :         return false;
    2849             :       }
    2850             :     }
    2851             :   }
    2852             : 
    2853    10469524 :   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
    2854    10469524 :       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
    2855    20940476 :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    2856             :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    2857        2970 :     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    2858             :                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
    2859             : 
    2860        2970 :     const unsigned StaticNumOps = Desc.getNumOperands() +
    2861        5940 :       Desc.getNumImplicitUses();
    2862        2970 :     const unsigned NumImplicitOps = IsDst ? 2 : 1;
    2863             : 
    2864             :     // Allow additional implicit operands. This allows a fixup done by the post
    2865             :     // RA scheduler where the main implicit operand is killed and implicit-defs
    2866             :     // are added for sub-registers that remain live after this instruction.
    2867        2970 :     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
    2868           0 :       ErrInfo = "missing implicit register operands";
    2869           0 :       return false;
    2870             :     }
    2871             : 
    2872             :     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    2873        2970 :     if (IsDst) {
    2874         771 :       if (!Dst->isUse()) {
    2875           0 :         ErrInfo = "v_movreld_b32 vdst should be a use operand";
    2876           0 :         return false;
    2877             :       }
    2878             : 
    2879             :       unsigned UseOpIdx;
    2880        1542 :       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
    2881         771 :           UseOpIdx != StaticNumOps + 1) {
    2882           0 :         ErrInfo = "movrel implicit operands should be tied";
    2883           0 :         return false;
    2884             :       }
    2885             :     }
    2886             : 
    2887        2970 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    2888             :     const MachineOperand &ImpUse
    2889        2970 :       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    2890        8910 :     if (!ImpUse.isReg() || !ImpUse.isUse() ||
    2891        2970 :         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
    2892           0 :       ErrInfo = "src0 should be subreg of implicit vector use";
    2893           0 :       return false;
    2894             :     }
    2895             :   }
    2896             : 
    2897             :   // Make sure we aren't losing exec uses in the td files. This mostly requires
    2898             :   // being careful when using let Uses to try to add other use registers.
    2899    10471723 :   if (shouldReadExec(MI)) {
    2900     5912028 :     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
    2901           0 :       ErrInfo = "VALU instruction does not implicitly read exec mask";
    2902           0 :       return false;
    2903             :     }
    2904             :   }
    2905             : 
    2906    10471723 :   if (isSMRD(MI)) {
    2907     1082112 :     if (MI.mayStore()) {
    2908             :       // The register offset form of scalar stores may only use m0 as the
    2909             :       // soffset register.
    2910             :       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
    2911         355 :       if (Soff && Soff->getReg() != AMDGPU::M0) {
    2912           0 :         ErrInfo = "scalar stores must use m0 as offset register";
    2913           0 :         return false;
    2914             :       }
    2915             :     }
    2916             :   }
    2917             : 
    2918    10471723 :   if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
    2919             :     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    2920      321743 :     if (Offset->getImm() != 0) {
    2921           0 :       ErrInfo = "subtarget does not support offsets in flat instructions";
    2922           0 :       return false;
    2923             :     }
    2924             :   }
    2925             : 
    2926             :   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
    2927    10471723 :   if (DppCt) {
    2928             :     using namespace AMDGPU::DPP;
    2929             : 
    2930         470 :     unsigned DC = DppCt->getImm();
    2931         470 :     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
    2932         940 :         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
    2933         940 :         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
    2934         940 :         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
    2935         940 :         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
    2936         470 :         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
    2937           0 :       ErrInfo = "Invalid dpp_ctrl value";
    2938           0 :       return false;
    2939             :     }
    2940             :   }
    2941             : 
    2942             :   return true;
    2943             : }
    2944             : 
    2945      101309 : unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
    2946      202618 :   switch (MI.getOpcode()) {
    2947             :   default: return AMDGPU::INSTRUCTION_LIST_END;
    2948       18882 :   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
    2949       46118 :   case AMDGPU::COPY: return AMDGPU::COPY;
    2950         410 :   case AMDGPU::PHI: return AMDGPU::PHI;
    2951           7 :   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
    2952           4 :   case AMDGPU::WQM: return AMDGPU::WQM;
    2953           6 :   case AMDGPU::WWM: return AMDGPU::WWM;
    2954          22 :   case AMDGPU::S_MOV_B32:
    2955          44 :     return MI.getOperand(1).isReg() ?
    2956             :            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
    2957        2446 :   case AMDGPU::S_ADD_I32:
    2958        2446 :     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
    2959         241 :   case AMDGPU::S_ADDC_U32:
    2960         241 :     return AMDGPU::V_ADDC_U32_e32;
    2961         884 :   case AMDGPU::S_SUB_I32:
    2962         884 :     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
    2963             :     // FIXME: These are not consistently handled, and selected when the carry is
    2964             :     // used.
    2965         121 :   case AMDGPU::S_ADD_U32:
    2966         121 :     return AMDGPU::V_ADD_I32_e32;
    2967           0 :   case AMDGPU::S_SUB_U32:
    2968           0 :     return AMDGPU::V_SUB_I32_e32;
    2969           0 :   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
    2970        1166 :   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
    2971        3810 :   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
    2972        2570 :   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
    2973         254 :   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
    2974          51 :   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
    2975          30 :   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
    2976          40 :   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
    2977          12 :   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
    2978        2460 :   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
    2979         180 :   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
    2980        4774 :   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
    2981         513 :   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
    2982        3875 :   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
    2983         149 :   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
    2984         568 :   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
    2985         977 :   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
    2986        1297 :   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
    2987        1991 :   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
    2988           0 :   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
    2989          15 :   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
    2990           9 :   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
    2991          18 :   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
    2992           0 :   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
    2993           0 :   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
    2994          14 :   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
    2995           4 :   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
    2996           5 :   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
    2997           0 :   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
    2998          17 :   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
    2999          26 :   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
    3000           2 :   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
    3001           3 :   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
    3002           2 :   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
    3003           0 :   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
    3004           1 :   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
    3005           1 :   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
    3006         128 :   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
    3007          34 :   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
    3008         158 :   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
    3009           2 :   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
    3010           0 :   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
    3011          75 :   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
    3012             :   }
    3013             : }
    3014             : 
    3015     1877150 : const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
    3016             :                                                       unsigned OpNo) const {
    3017     1877150 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    3018     3754300 :   const MCInstrDesc &Desc = get(MI.getOpcode());
    3019     5101317 :   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
    3020     1471564 :       Desc.OpInfo[OpNo].RegClass == -1) {
    3021     1680942 :     unsigned Reg = MI.getOperand(OpNo).getReg();
    3022             : 
    3023      840471 :     if (TargetRegisterInfo::isVirtualRegister(Reg))
    3024      517207 :       return MRI.getRegClass(Reg);
    3025      323264 :     return RI.getPhysRegClass(Reg);
    3026             :   }
    3027             : 
    3028     1036679 :   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
    3029     2073358 :   return RI.getRegClass(RCID);
    3030             : }
    3031             : 
    3032      124360 : bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
    3033      124360 :   switch (MI.getOpcode()) {
    3034       77841 :   case AMDGPU::COPY:
    3035             :   case AMDGPU::REG_SEQUENCE:
    3036             :   case AMDGPU::PHI:
    3037             :   case AMDGPU::INSERT_SUBREG:
    3038       77841 :     return RI.hasVGPRs(getOpRegClass(MI, 0));
    3039       46519 :   default:
    3040       46519 :     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
    3041             :   }
    3042             : }
    3043             : 
    3044       15527 : void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
    3045             :   MachineBasicBlock::iterator I = MI;
    3046       15527 :   MachineBasicBlock *MBB = MI.getParent();
    3047       15527 :   MachineOperand &MO = MI.getOperand(OpIdx);
    3048       15527 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    3049       46581 :   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
    3050       15527 :   const TargetRegisterClass *RC = RI.getRegClass(RCID);
    3051             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
    3052       15527 :   if (MO.isReg())
    3053             :     Opcode = AMDGPU::COPY;
    3054           0 :   else if (RI.isSGPRClass(RC))
    3055             :     Opcode = AMDGPU::S_MOV_B32;
    3056             : 
    3057       15527 :   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
    3058       15527 :   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    3059             :     VRC = &AMDGPU::VReg_64RegClass;
    3060             :   else
    3061             :     VRC = &AMDGPU::VGPR_32RegClass;
    3062             : 
    3063       15527 :   unsigned Reg = MRI.createVirtualRegister(VRC);
    3064             :   DebugLoc DL = MBB->findDebugLoc(I);
    3065       46581 :   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
    3066       15527 :   MO.ChangeToRegister(Reg, false);
    3067       15527 : }
    3068             : 
    3069       28046 : unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
    3070             :                                          MachineRegisterInfo &MRI,
    3071             :                                          MachineOperand &SuperReg,
    3072             :                                          const TargetRegisterClass *SuperRC,
    3073             :                                          unsigned SubIdx,
    3074             :                                          const TargetRegisterClass *SubRC)
    3075             :                                          const {
    3076       28046 :   MachineBasicBlock *MBB = MI->getParent();
    3077             :   DebugLoc DL = MI->getDebugLoc();
    3078       28046 :   unsigned SubReg = MRI.createVirtualRegister(SubRC);
    3079             : 
    3080       28046 :   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    3081       84138 :     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    3082       28046 :       .addReg(SuperReg.getReg(), 0, SubIdx);
    3083       28046 :     return SubReg;
    3084             :   }
    3085             : 
    3086             :   // Just in case the super register is itself a sub-register, copy it to a new
    3087             :   // value so we don't need to worry about merging its subreg index with the
    3088             :   // SubIdx passed to this function. The register coalescer should be able to
    3089             :   // eliminate this extra copy.
    3090           0 :   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
    3091             : 
    3092           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    3093           0 :     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
    3094             : 
    3095           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    3096           0 :     .addReg(NewSuperReg, 0, SubIdx);
    3097             : 
    3098           0 :   return SubReg;
    3099             : }
    3100             : 
    3101       27992 : MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
    3102             :   MachineBasicBlock::iterator MII,
    3103             :   MachineRegisterInfo &MRI,
    3104             :   MachineOperand &Op,
    3105             :   const TargetRegisterClass *SuperRC,
    3106             :   unsigned SubIdx,
    3107             :   const TargetRegisterClass *SubRC) const {
    3108       27992 :   if (Op.isImm()) {
    3109           0 :     if (SubIdx == AMDGPU::sub0)
    3110           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    3111           0 :     if (SubIdx == AMDGPU::sub1)
    3112           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
    3113             : 
    3114           0 :     llvm_unreachable("Unhandled register index for immediate");
    3115             :   }
    3116             : 
    3117             :   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
    3118       27992 :                                        SubIdx, SubRC);
    3119             :   return MachineOperand::CreateReg(SubReg, false);
    3120             : }
    3121             : 
    3122             : // Change the order of operands from (0, 1, 2) to (0, 2, 1)
    3123        6438 : void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
    3124             :   assert(Inst.getNumExplicitOperands() == 3);
    3125        6438 :   MachineOperand Op1 = Inst.getOperand(1);
    3126        6438 :   Inst.RemoveOperand(1);
    3127        6438 :   Inst.addOperand(Op1);
    3128        6438 : }
    3129             : 
    3130      350050 : bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
    3131             :                                     const MCOperandInfo &OpInfo,
    3132             :                                     const MachineOperand &MO) const {
    3133      350050 :   if (!MO.isReg())
    3134             :     return false;
    3135             : 
    3136      349527 :   unsigned Reg = MO.getReg();
    3137             :   const TargetRegisterClass *RC =
    3138      357377 :     TargetRegisterInfo::isVirtualRegister(Reg) ?
    3139             :     MRI.getRegClass(Reg) :
    3140        7850 :     RI.getPhysRegClass(Reg);
    3141             : 
    3142             :   const SIRegisterInfo *TRI =
    3143      349527 :       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
    3144      349527 :   RC = TRI->getSubRegClass(RC, MO.getSubReg());
    3145             : 
    3146             :   // In order to be legal, the common sub-class must be equal to the
    3147             :   // class of the current operand.  For example:
    3148             :   //
    3149             :   // v_mov_b32 s0 ; Operand defined as vsrc_b32
    3150             :   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
    3151             :   //
    3152             :   // s_sendmsg 0, s0 ; Operand defined as m0reg
    3153             :   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
    3154             : 
    3155      699054 :   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
    3156             : }
    3157             : 
    3158           0 : bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
    3159             :                                      const MCOperandInfo &OpInfo,
    3160             :                                      const MachineOperand &MO) const {
    3161           0 :   if (MO.isReg())
    3162           0 :     return isLegalRegOperand(MRI, OpInfo, MO);
    3163             : 
    3164             :   // Handle non-register types that are treated like immediates.
    3165             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    3166             :   return true;
    3167             : }
    3168             : 
    3169      449083 : bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
    3170             :                                  const MachineOperand *MO) const {
    3171      449083 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    3172      449083 :   const MCInstrDesc &InstDesc = MI.getDesc();
    3173      449083 :   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
    3174             :   const TargetRegisterClass *DefinedRC =
    3175      449083 :       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
    3176      449083 :   if (!MO)
    3177           0 :     MO = &MI.getOperand(OpIdx);
    3178             : 
    3179      449083 :   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    3180             : 
    3181             :     RegSubRegPair SGPRUsed;
    3182      103262 :     if (MO->isReg())
    3183       80341 :       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
    3184             : 
    3185      552504 :     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    3186      478810 :       if (i == OpIdx)
    3187       95224 :         continue;
    3188      383586 :       const MachineOperand &Op = MI.getOperand(i);
    3189      383586 :       if (Op.isReg()) {
    3190      630420 :         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
    3191      270475 :             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
    3192             :           return false;
    3193             :         }
    3194       68521 :       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
    3195             :         return false;
    3196             :       }
    3197             :     }
    3198             :   }
    3199             : 
    3200      419515 :   if (MO->isReg()) {
    3201             :     assert(DefinedRC);
    3202      334489 :     return isLegalRegOperand(MRI, OpInfo, *MO);
    3203             :   }
    3204             : 
    3205             :   // Handle non-register types that are treated like immediates.
    3206             :   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
    3207             : 
    3208       85026 :   if (!DefinedRC) {
    3209             :     // This operand expects an immediate.
    3210             :     return true;
    3211             :   }
    3212             : 
    3213       85026 :   return isImmOperandLegal(MI, OpIdx, *MO);
    3214             : }
    3215             : 
    3216        8470 : void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
    3217             :                                        MachineInstr &MI) const {
    3218        8470 :   unsigned Opc = MI.getOpcode();
    3219        8470 :   const MCInstrDesc &InstrDesc = get(Opc);
    3220             : 
    3221        8470 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    3222        8470 :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    3223             : 
    3224             :   // If there is an implicit SGPR use such as the VCC use for v_addc_u32/
    3225             :   // v_subb_u32, we are only allowed a single constant bus use in total.
    3226             :   //
    3227             :   // Note we do not need to worry about literal constants here. They are
    3228             :   // disabled for these instructions' operand types because they would always
    3229             :   // violate the one constant bus use rule.
    3230        8470 :   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
    3231        8470 :   if (HasImplicitSGPR) {
    3232         241 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3233         241 :     MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3234             : 
    3235         241 :     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
    3236          52 :       legalizeOpWithMove(MI, Src0Idx);
    3237             :   }
    3238             : 
    3239             :   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
    3240             :   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
    3241             :   // src0/src1 with V_READFIRSTLANE.
    3242        8470 :   if (Opc == AMDGPU::V_WRITELANE_B32) {
    3243           2 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3244           2 :     MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3245             :     const DebugLoc &DL = MI.getDebugLoc();
    3246           2 :     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
    3247           0 :       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3248           0 :       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3249             :           .add(Src0);
    3250           0 :       Src0.ChangeToRegister(Reg, false);
    3251             :     }
    3252           2 :     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
    3253           2 :       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3254             :       const DebugLoc &DL = MI.getDebugLoc();
    3255           6 :       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3256             :           .add(Src1);
    3257           2 :       Src1.ChangeToRegister(Reg, false);
    3258             :     }
    3259             :     return;
    3260             :   }
    3261             : 
    3262             :   // VOP2 instructions accept any operand type for src0, so we don't need to
    3263             :   // check its legality. If src1 is already legal, we don't need to do anything.
    3264        8468 :   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    3265             :     return;
    3266             : 
    3267             :   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
    3268             :   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
    3269             :   // select is uniform.
    3270        7285 :   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
    3271           1 :       RI.isVGPR(MRI, Src1.getReg())) {
    3272           1 :     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3273             :     const DebugLoc &DL = MI.getDebugLoc();
    3274           3 :     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3275             :         .add(Src1);
    3276           1 :     Src1.ChangeToRegister(Reg, false);
    3277           1 :     return;
    3278             :   }
    3279             : 
    3280             :   // We do not use commuteInstruction here because it is too aggressive and will
    3281             :   // commute whenever possible. We only want to commute here if it improves
    3282             :   // legality. This can be called a fairly large number of times, so don't waste
    3283             :   // compile time pointlessly swapping and checking legality again.
    3284       14375 :   if (HasImplicitSGPR || !MI.isCommutable()) {
    3285         189 :     legalizeOpWithMove(MI, Src1Idx);
    3286         189 :     return;
    3287             :   }
    3288             : 
    3289        7093 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3290        7093 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3291             : 
    3292             :   // If src0 can be used as src1, commuting will make the operands legal.
    3293             :   // Otherwise we have to give up and insert a move.
    3294             :   //
    3295             :   // TODO: Other immediate-like operand kinds could be commuted if there was a
    3296             :   // MachineOperand::ChangeTo* for them.
    3297       20756 :   if ((!Src1.isImm() && !Src1.isReg()) ||
    3298        7093 :       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    3299           0 :     legalizeOpWithMove(MI, Src1Idx);
    3300           0 :     return;
    3301             :   }
    3302             : 
    3303             :   int CommutedOpc = commuteOpcode(MI);
    3304        7093 :   if (CommutedOpc == -1) {
    3305           0 :     legalizeOpWithMove(MI, Src1Idx);
    3306           0 :     return;
    3307             :   }
    3308             : 
    3309        7093 :   MI.setDesc(get(CommutedOpc));
    3310             : 
    3311        7093 :   unsigned Src0Reg = Src0.getReg();
    3312             :   unsigned Src0SubReg = Src0.getSubReg();
    3313             :   bool Src0Kill = Src0.isKill();
    3314             : 
    3315        7093 :   if (Src1.isImm())
    3316         523 :     Src0.ChangeToImmediate(Src1.getImm());
    3317        6570 :   else if (Src1.isReg()) {
    3318        6570 :     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    3319             :     Src0.setSubReg(Src1.getSubReg());
    3320             :   } else
    3321           0 :     llvm_unreachable("Should only have register or immediate operands");
    3322             : 
    3323        7093 :   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
    3324             :   Src1.setSubReg(Src0SubReg);
    3325             : }
    3326             : 
    3327             : // Legalize VOP3 operands. Because all operand types are supported for any
    3328             : // operand, and since literal constants are not allowed and should never be
    3329             : // seen, we only need to worry about inserting copies if we use multiple SGPR
    3330             : // operands.
    3331       59658 : void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
    3332             :                                        MachineInstr &MI) const {
    3333       59658 :   unsigned Opc = MI.getOpcode();
    3334             : 
    3335             :   int VOP3Idx[3] = {
    3336       59658 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    3337       59658 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    3338       59658 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
    3339      178974 :   };
    3340             : 
    3341             :   // Find the one SGPR operand we are allowed to use.
    3342       59658 :   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
    3343             : 
    3344      324084 :   for (unsigned i = 0; i < 3; ++i) {
    3345      173701 :     int Idx = VOP3Idx[i];
    3346      173701 :     if (Idx == -1)
    3347             :       break;
    3348      132213 :     MachineOperand &MO = MI.getOperand(Idx);
    3349             : 
    3350             :     // We should never see a VOP3 instruction with an illegal immediate operand.
    3351      132213 :     if (!MO.isReg())
    3352       11825 :       continue;
    3353             : 
    3354      240776 :     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
    3355       62780 :       continue; // VGPRs are legal
    3356             : 
    3357       99930 :     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
    3358       42322 :       SGPRReg = MO.getReg();
    3359             :       // We can use one SGPR in each VOP3 instruction.
    3360       42322 :       continue;
    3361             :     }
    3362             : 
    3363             :     // If we make it this far, then the operand is not legal and we must
    3364             :     // legalize it.
    3365       15286 :     legalizeOpWithMove(MI, Idx);
    3366             :   }
    3367       59658 : }
    3368             : 
    3369          43 : unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
    3370             :                                          MachineRegisterInfo &MRI) const {
    3371             :   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
    3372          43 :   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
    3373          43 :   unsigned DstReg = MRI.createVirtualRegister(SRC);
    3374          43 :   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
    3375             : 
    3376          43 :   if (SubRegs == 1) {
    3377           4 :     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3378           2 :             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
    3379           2 :         .addReg(SrcReg);
    3380           2 :     return DstReg;
    3381             :   }
    3382             : 
    3383             :   SmallVector<unsigned, 8> SRegs;
    3384         269 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3385         114 :     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3386         228 :     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3387         114 :             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
    3388         114 :         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    3389         114 :     SRegs.push_back(SGPR);
    3390             :   }
    3391             : 
    3392             :   MachineInstrBuilder MIB =
    3393          41 :       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3394          82 :               get(AMDGPU::REG_SEQUENCE), DstReg);
    3395         269 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3396         228 :     MIB.addReg(SRegs[i]);
    3397         114 :     MIB.addImm(RI.getSubRegFromChannel(i));
    3398             :   }
    3399             :   return DstReg;
    3400             : }
    3401             : 
    3402          29 : void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
    3403             :                                        MachineInstr &MI) const {
    3404             : 
    3405             :   // If the pointer is stored in VGPRs, then we need to move it to
    3406             :   // SGPRs using v_readfirstlane. This is safe because we only select
    3407             :   // loads with uniform pointers to SMRD instructions, so we know the
    3408             :   // pointer value is uniform.
    3409          29 :   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
    3410          58 :   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    3411          29 :       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    3412          29 :       SBase->setReg(SGPR);
    3413             :   }
    3414          29 : }
    3415             : 
    3416       26924 : void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
    3417             :                                          MachineBasicBlock::iterator I,
    3418             :                                          const TargetRegisterClass *DstRC,
    3419             :                                          MachineOperand &Op,
    3420             :                                          MachineRegisterInfo &MRI,
    3421             :                                          const DebugLoc &DL) const {
    3422       26924 :   unsigned OpReg = Op.getReg();
    3423             :   unsigned OpSubReg = Op.getSubReg();
    3424             : 
    3425       26924 :   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
    3426       26924 :       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
    3427             : 
    3428             :   // Check if operand is already the correct register class.
    3429       26924 :   if (DstRC == OpRC)
    3430             :     return;
    3431             : 
    3432       26611 :   unsigned DstReg = MRI.createVirtualRegister(DstRC);
    3433             :   MachineInstr *Copy =
    3434       79833 :       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
    3435             : 
    3436       26611 :   Op.setReg(DstReg);
    3437             :   Op.setSubReg(0);
    3438             : 
    3439       26611 :   MachineInstr *Def = MRI.getVRegDef(OpReg);
    3440       26611 :   if (!Def)
    3441             :     return;
    3442             : 
    3443             :   // Try to eliminate the copy if it is copying an immediate value.
    3444       26611 :   if (Def->isMoveImmediate())
    3445        6573 :     FoldImmediate(*Copy, *Def, OpReg, &MRI);
    3446             : }
    3447             : 
    3448       86030 : void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
    3449       86030 :   MachineFunction &MF = *MI.getParent()->getParent();
    3450       86030 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3451             : 
    3452             :   // Legalize VOP2
    3453      163665 :   if (isVOP2(MI) || isVOPC(MI)) {
    3454        8470 :     legalizeOperandsVOP2(MRI, MI);
    3455        8470 :     return;
    3456             :   }
    3457             : 
    3458             :   // Legalize VOP3
    3459       77560 :   if (isVOP3(MI)) {
    3460       29551 :     legalizeOperandsVOP3(MRI, MI);
    3461       29551 :     return;
    3462             :   }
    3463             : 
    3464             :   // Legalize SMRD
    3465       48009 :   if (isSMRD(MI)) {
    3466          29 :     legalizeOperandsSMRD(MRI, MI);
    3467          29 :     return;
    3468             :   }
    3469             : 
    3470             :   // Legalize REG_SEQUENCE and PHI
    3471             :   // The register class of the operands must be the same type as the register
    3472             :   // class of the output.
    3473       47980 :   if (MI.getOpcode() == AMDGPU::PHI) {
    3474             :     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    3475        1226 :     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
    3476        2448 :       if (!MI.getOperand(i).isReg() ||
    3477         816 :           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
    3478           0 :         continue;
    3479             :       const TargetRegisterClass *OpRC =
    3480             :           MRI.getRegClass(MI.getOperand(i).getReg());
    3481         816 :       if (RI.hasVGPRs(OpRC)) {
    3482             :         VRC = OpRC;
    3483             :       } else {
    3484             :         SRC = OpRC;
    3485             :       }
    3486             :     }
    3487             : 
    3488             :     // If any of the operands are VGPR registers, then they all must be;
    3489             :     // otherwise we will create illegal VGPR->SGPR copies when legalizing
    3490             :     // them.
    3491         544 :     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
    3492         410 :       if (!VRC) {
    3493             :         assert(SRC);
    3494         134 :         VRC = RI.getEquivalentVGPRClass(SRC);
    3495             :       }
    3496             :       RC = VRC;
    3497             :     } else {
    3498             :       RC = SRC;
    3499             :     }
    3500             : 
    3501             :     // Update all the operands so they have the same type.
    3502        1226 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3503         816 :       MachineOperand &Op = MI.getOperand(I);
    3504        1632 :       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3505           0 :         continue;
    3506             : 
    3507             :       // MI is a PHI instruction.
    3508        1632 :       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
    3509         816 :       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
    3510             : 
    3511             :       // Avoid creating no-op copies with the same src and dst reg class.  These
    3512             :       // confuse some of the machine passes.
    3513         816 :       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    3514             :     }
    3515             :   }
    3516             : 
    3517             :   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
    3518             :   // VGPR dest type and SGPR sources, insert copies so all operands are
    3519             :   // VGPRs. This seems to help operand folding / the register coalescer.
    3520       95960 :   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    3521       18882 :     MachineBasicBlock *MBB = MI.getParent();
    3522       18882 :     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    3523       18882 :     if (RI.hasVGPRs(DstRC)) {
    3524             :       // Update all the operands so they are VGPR register classes. These may
    3525             :       // not be the same register class because REG_SEQUENCE supports mixing
    3526             :       // subregister index types e.g. sub0_sub1 + sub2 + sub3
    3527       69264 :       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3528       50382 :         MachineOperand &Op = MI.getOperand(I);
    3529      100764 :         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3530           0 :           continue;
    3531             : 
    3532             :         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
    3533       50382 :         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
    3534       74691 :         if (VRC == OpRC)
    3535       24309 :           continue;
    3536             : 
    3537       26073 :         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
    3538             :         Op.setIsKill();
    3539             :       }
    3540             :     }
    3541             : 
    3542             :     return;
    3543             :   }
    3544             : 
    3545             :   // Legalize INSERT_SUBREG
    3546             :   // src0 must have the same register class as dst
    3547       29098 :   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    3548           7 :     unsigned Dst = MI.getOperand(0).getReg();
    3549           7 :     unsigned Src0 = MI.getOperand(1).getReg();
    3550             :     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    3551             :     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    3552           7 :     if (DstRC != Src0RC) {
    3553           5 :       MachineBasicBlock *MBB = MI.getParent();
    3554             :       MachineOperand &Op = MI.getOperand(1);
    3555           5 :       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    3556             :     }
    3557             :     return;
    3558             :   }
    3559             : 
    3560             :   // Legalize SI_INIT_M0
    3561       29091 :   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    3562           2 :     MachineOperand &Src = MI.getOperand(0);
    3563           4 :     if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
    3564           2 :       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    3565             :     return;
    3566             :   }
    3567             : 
    3568             :   // Legalize MIMG and MUBUF/MTBUF for shaders.
    3569             :   //
    3570             :   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
    3571             :   // scratch memory access. In both cases, the legalization never involves
    3572             :   // conversion to the addr64 form.
    3573       58174 :   if (isMIMG(MI) ||
    3574       58508 :       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
    3575         294 :        (isMUBUF(MI) || isMTBUF(MI)))) {
    3576          48 :     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    3577         144 :     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
    3578          10 :       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
    3579          10 :       SRsrc->setReg(SGPR);
    3580             :     }
    3581             : 
    3582          48 :     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    3583          52 :     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
    3584           2 :       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
    3585           2 :       SSamp->setReg(SGPR);
    3586             :     }
    3587             :     return;
    3588             :   }
    3589             : 
    3590             :   // Legalize MUBUF* instructions by converting to addr64 form.
    3591             :   // FIXME: If we start using the non-addr64 instructions for compute, we
    3592             :   // may need to legalize them as above. This especially applies to the
    3593             :   // buffer_load_format_* variants and variants with idxen (or bothen).
    3594             :   int SRsrcIdx =
    3595       29041 :       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    3596       29041 :   if (SRsrcIdx != -1) {
    3597             :     // We have an MUBUF instruction
    3598          54 :     MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
    3599         108 :     unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
    3600         162 :     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
    3601             :                                              RI.getRegClass(SRsrcRC))) {
    3602             :       // The operands are legal.
    3603             :       // FIXME: We may need to legalize operands besides srsrc.
    3604             :       return;
    3605             :     }
    3606             : 
    3607          54 :     MachineBasicBlock &MBB = *MI.getParent();
    3608             : 
    3609             :     // Extract the ptr from the resource descriptor.
    3610          54 :     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
    3611          54 :       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
    3612             : 
    3613             :     // Create an empty resource descriptor
    3614          54 :     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3615          54 :     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3616          54 :     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3617          54 :     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    3618          54 :     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
    3619             : 
    3620             :     // Zero64 = 0
    3621         162 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
    3622             :         .addImm(0);
    3623             : 
    3624             :     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    3625         162 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
    3626          54 :         .addImm(RsrcDataFormat & 0xFFFFFFFF);
    3627             : 
    3628             :     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    3629         162 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
    3630          54 :         .addImm(RsrcDataFormat >> 32);
    3631             : 
    3632             :     // NewSRsrc = {Zero64, SRsrcFormat}
    3633         162 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
    3634          54 :         .addReg(Zero64)
    3635             :         .addImm(AMDGPU::sub0_sub1)
    3636          54 :         .addReg(SRsrcFormatLo)
    3637             :         .addImm(AMDGPU::sub2)
    3638          54 :         .addReg(SRsrcFormatHi)
    3639             :         .addImm(AMDGPU::sub3);
    3640             : 
    3641          54 :     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    3642          54 :     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    3643          54 :     if (VAddr) {
    3644             :       // This is already an ADDR64 instruction so we need to add the pointer
    3645             :       // extracted from the resource descriptor to the current value of VAddr.
    3646           9 :       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3647           9 :       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3648             : 
    3649             :       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
    3650             :       DebugLoc DL = MI.getDebugLoc();
    3651          27 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
    3652           9 :         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
    3653           9 :         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
    3654             : 
    3655             :       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
    3656          27 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
    3657           9 :         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
    3658           9 :         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
    3659             : 
    3660             :       // NewVAddr = {NewVAddrLo, NewVAddrHi}
    3661          27 :       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
    3662           9 :           .addReg(NewVAddrLo)
    3663             :           .addImm(AMDGPU::sub0)
    3664           9 :           .addReg(NewVAddrHi)
    3665             :           .addImm(AMDGPU::sub1);
    3666             :     } else {
    3667             :       // This instruction is the _OFFSET variant, so we need to convert it to
    3668             :       // ADDR64.
    3669             :       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
    3670             :              < SISubtarget::VOLCANIC_ISLANDS &&
    3671             :              "FIXME: Need to emit flat atomics here");
    3672             : 
    3673          45 :       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
    3674          45 :       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    3675          45 :       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
    3676          90 :       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
    3677             : 
    3678             :       // Atomics with return have an additional tied operand and are
    3679             :       // missing some of the special bits.
    3680          45 :       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
    3681             :       MachineInstr *Addr64;
    3682             : 
    3683          45 :       if (!VDataIn) {
    3684             :         // Regular buffer load / store.
    3685             :         MachineInstrBuilder MIB =
    3686         132 :             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    3687             :                 .add(*VData)
    3688          44 :                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
    3689             :                 // This will be replaced later
    3690             :                 // with the new value of vaddr.
    3691             :                 .add(*SRsrc)
    3692             :                 .add(*SOffset)
    3693          44 :                 .add(*Offset);
    3694             : 
    3695             :         // Atomics do not have this operand.
    3696          44 :         if (const MachineOperand *GLC =
    3697             :                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
    3698          43 :           MIB.addImm(GLC->getImm());
    3699             :         }
    3700             : 
    3701             :         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
    3702             : 
    3703          44 :         if (const MachineOperand *TFE =
    3704             :                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
    3705          43 :           MIB.addImm(TFE->getImm());
    3706             :         }
    3707             : 
    3708          44 :         MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3709             :         Addr64 = MIB;
    3710             :       } else {
    3711             :         // Atomics with return.
    3712           3 :         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    3713             :                      .add(*VData)
    3714             :                      .add(*VDataIn)
    3715           1 :                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
    3716             :                      // This will be replaced later
    3717             :                      // with the new value of vaddr.
    3718             :                      .add(*SRsrc)
    3719             :                      .add(*SOffset)
    3720             :                      .add(*Offset)
    3721             :                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
    3722           1 :                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3723             :       }
    3724             : 
    3725          45 :       MI.removeFromParent();
    3726             : 
    3727             :       // NewVAddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
    3728          90 :       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
    3729          45 :               NewVAddr)
    3730          45 :           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
    3731             :           .addImm(AMDGPU::sub0)
    3732          45 :           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
    3733             :           .addImm(AMDGPU::sub1);
    3734             : 
    3735          45 :       VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
    3736          45 :       SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
    3737             :     }
    3738             : 
    3739             :     // Update the instruction to use NewVaddr
    3740          54 :     VAddr->setReg(NewVAddr);
    3741             :     // Update the instruction to use NewSRsrc
    3742          54 :     SRsrc->setReg(NewSRsrc);
    3743             :   }
    3744             : }
    3745             : 
    3746       36008 : void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
    3747             :   SetVectorType Worklist;
    3748       36008 :   Worklist.insert(&TopInst);
    3749             : 
    3750      137317 :   while (!Worklist.empty()) {
    3751             :     MachineInstr &Inst = *Worklist.pop_back_val();
    3752      101309 :     MachineBasicBlock *MBB = Inst.getParent();
    3753      101309 :     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    3754             : 
    3755      101309 :     unsigned Opcode = Inst.getOpcode();
    3756      101309 :     unsigned NewOpcode = getVALUOp(Inst);
    3757             : 
    3758             :     // Handle some special cases
    3759      101309 :     switch (Opcode) {
    3760             :     default:
    3761             :       break;
    3762        4261 :     case AMDGPU::S_ADD_U64_PSEUDO:
    3763             :     case AMDGPU::S_SUB_U64_PSEUDO:
    3764        4261 :       splitScalar64BitAddSub(Worklist, Inst);
    3765        4261 :       Inst.eraseFromParent();
    3766        4261 :       continue;
    3767        3330 :     case AMDGPU::S_ADD_I32:
    3768             :     case AMDGPU::S_SUB_I32:
    3769             :       // FIXME: The u32 versions currently selected use the carry.
    3770        3330 :       if (moveScalarAddSub(Worklist, Inst))
    3771         262 :         continue;
    3772             : 
    3773             :       // Default handling
    3774             :       break;
    3775         115 :     case AMDGPU::S_AND_B64:
    3776         115 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
    3777         115 :       Inst.eraseFromParent();
    3778         115 :       continue;
    3779             : 
    3780         196 :     case AMDGPU::S_OR_B64:
    3781         196 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
    3782         196 :       Inst.eraseFromParent();
    3783         196 :       continue;
    3784             : 
    3785         130 :     case AMDGPU::S_XOR_B64:
    3786         130 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
    3787         130 :       Inst.eraseFromParent();
    3788         130 :       continue;
    3789             : 
    3790          18 :     case AMDGPU::S_NOT_B64:
    3791          18 :       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
    3792          18 :       Inst.eraseFromParent();
    3793          18 :       continue;
    3794             : 
    3795          26 :     case AMDGPU::S_BCNT1_I32_B64:
    3796          26 :       splitScalar64BitBCNT(Worklist, Inst);
    3797          26 :       Inst.eraseFromParent();
    3798          26 :       continue;
    3799             : 
    3800        1812 :     case AMDGPU::S_BFE_I64:
    3801        1812 :       splitScalar64BitBFE(Worklist, Inst);
    3802        1812 :       Inst.eraseFromParent();
    3803        1812 :       continue;
    3804             : 
    3805        4774 :     case AMDGPU::S_LSHL_B32:
    3806        4774 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3807             :         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
    3808        2440 :         swapOperands(Inst);
    3809             :       }
    3810             :       break;
    3811        2460 :     case AMDGPU::S_ASHR_I32:
    3812        2460 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3813             :         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
    3814        1371 :         swapOperands(Inst);
    3815             :       }
    3816             :       break;
    3817        3875 :     case AMDGPU::S_LSHR_B32:
    3818        3875 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3819             :         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
    3820        2336 :         swapOperands(Inst);
    3821             :       }
    3822             :       break;
    3823         513 :     case AMDGPU::S_LSHL_B64:
    3824         513 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3825             :         NewOpcode = AMDGPU::V_LSHLREV_B64;
    3826         155 :         swapOperands(Inst);
    3827             :       }
    3828             :       break;
    3829         180 :     case AMDGPU::S_ASHR_I64:
    3830         180 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3831             :         NewOpcode = AMDGPU::V_ASHRREV_I64;
    3832          58 :         swapOperands(Inst);
    3833             :       }
    3834             :       break;
    3835         149 :     case AMDGPU::S_LSHR_B64:
    3836         149 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3837             :         NewOpcode = AMDGPU::V_LSHRREV_B64;
    3838          78 :         swapOperands(Inst);
    3839             :       }
    3840             :       break;
    3841             : 
    3842          24 :     case AMDGPU::S_ABS_I32:
    3843          24 :       lowerScalarAbs(Worklist, Inst);
    3844          24 :       Inst.eraseFromParent();
    3845          24 :       continue;
    3846             : 
    3847          75 :     case AMDGPU::S_CBRANCH_SCC0:
    3848             :     case AMDGPU::S_CBRANCH_SCC1:
    3849             :       // Clear unused bits of vcc
    3850         150 :       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
    3851          75 :               AMDGPU::VCC)
    3852          75 :           .addReg(AMDGPU::EXEC)
    3853          75 :           .addReg(AMDGPU::VCC);
    3854          75 :       break;
    3855             : 
    3856           0 :     case AMDGPU::S_BFE_U64:
    3857             :     case AMDGPU::S_BFM_B64:
    3858           0 :       llvm_unreachable("Moving this op to VALU not implemented");
    3859             : 
    3860         215 :     case AMDGPU::S_PACK_LL_B32_B16:
    3861             :     case AMDGPU::S_PACK_LH_B32_B16:
    3862             :     case AMDGPU::S_PACK_HH_B32_B16:
    3863         215 :       movePackToVALU(Worklist, MRI, Inst);
    3864         215 :       Inst.eraseFromParent();
    3865         215 :       continue;
    3866             : 
    3867          15 :     case AMDGPU::S_XNOR_B32:
    3868          15 :       lowerScalarXnor(Worklist, Inst);
    3869          15 :       Inst.eraseFromParent();
    3870          15 :       continue;
    3871             : 
    3872           5 :     case AMDGPU::S_XNOR_B64:
    3873           5 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
    3874           5 :       Inst.eraseFromParent();
    3875           5 :       continue;
    3876             : 
    3877             :     case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
    3878          40 :       unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3879          40 :       const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
    3880          40 :       auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
    3881             :       unsigned Offset = 0;
    3882             : 
    3883             :       // FIXME: This isn't safe because the addressing mode doesn't work
    3884             :       // correctly if vaddr is negative.
    3885             :       //
    3886             :       // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
    3887             :       //
    3888             :       // See if we can extract an immediate offset by recognizing one of these:
    3889             :       //   V_ADD_I32_e32 dst, imm, src1
    3890             :       //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
    3891             :       // V_ADD will be removed by "Remove dead machine instructions".
    3892          80 :       if (Add &&
    3893          56 :           (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
    3894             :            Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
    3895             :         static const unsigned SrcNames[2] = {
    3896             :           AMDGPU::OpName::src0,
    3897             :           AMDGPU::OpName::src1,
    3898             :         };
    3899             : 
    3900             :         // Find a literal offset in one of the source operands.
    3901          62 :         for (int i = 0; i < 2; i++) {
    3902          43 :           const MachineOperand *Src =
    3903          43 :             getNamedOperand(*Add, SrcNames[i]);
    3904             : 
    3905          43 :           if (Src->isReg()) {
    3906          43 :             auto Mov = MRI.getUniqueVRegDef(Src->getReg());
    3907          86 :             if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
    3908          32 :               Src = &Mov->getOperand(1);
    3909             :           }
    3910             : 
    3911          43 :           if (Src) {
    3912          43 :             if (Src->isImm())
    3913          32 :               Offset = Src->getImm();
    3914          11 :             else if (Src->isCImm())
    3915           0 :               Offset = Src->getCImm()->getZExtValue();
    3916             :           }
    3917             : 
    3918          32 :           if (Offset && isLegalMUBUFImmOffset(Offset)) {
    3919          28 :             VAddr = getNamedOperand(*Add, SrcNames[!i]);
    3920          28 :             break;
    3921             :           }
    3922             : 
    3923             :           Offset = 0;
    3924             :         }
    3925             :       }
    3926             : 
    3927             :       MachineInstr *NewInstr =
    3928          40 :         BuildMI(*MBB, Inst, Inst.getDebugLoc(),
    3929          40 :               get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
    3930             :         .add(*VAddr) // vaddr
    3931          40 :         .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
    3932             :         .addImm(0) // soffset
    3933          40 :         .addImm(Offset) // offset
    3934          40 :         .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
    3935             :         .addImm(0) // slc
    3936             :         .addImm(0) // tfe
    3937          40 :         .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end())
    3938             :         .getInstr();
    3939             : 
    3940          40 :       MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
    3941             :                          VDst);
    3942          40 :       addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
    3943          40 :       Inst.eraseFromParent();
    3944             : 
    3945             :       // Legalize all operands other than the offset. Notably, convert the srsrc
    3946             :       // into SGPRs using v_readfirstlane if needed.
    3947          40 :       legalizeOperands(*NewInstr);
    3948          40 :       continue;
    3949        6817 :     }
    3950             :     }
    3951             : 
    3952       87850 :     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
    3953             :       // We cannot move this instruction to the VALU, so we should try to
    3954             :       // legalize its operands instead.
    3955          98 :       legalizeOperands(Inst);
    3956          98 :       continue;
    3957             :     }
    3958             : 
    3959             :     // Use the new VALU Opcode.
    3960       94092 :     const MCInstrDesc &NewDesc = get(NewOpcode);
    3961             :     Inst.setDesc(NewDesc);
    3962             : 
    3963             :     // Remove any references to SCC. Vector instructions can't read from it, and
    3964             :     // we're just about to add the implicit use / defs of VCC, and we don't want
    3965             :     // both.
    3966      323803 :     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
    3967      229711 :       MachineOperand &Op = Inst.getOperand(i);
    3968      229711 :       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
    3969       25964 :         Inst.RemoveOperand(i);
    3970       25964 :         addSCCDefUsersToVALUWorklist(Inst, Worklist);
    3971             :       }
    3972             :     }
    3973             : 
    3974       94092 :     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
    3975             :       // We are converting these to a BFE, so we need to add the missing
    3976             :       // operands for the size and offset.
    3977        1545 :       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
    3978        1545 :       Inst.addOperand(MachineOperand::CreateImm(0));
    3979        3090 :       Inst.addOperand(MachineOperand::CreateImm(Size));
    3980             : 
    3981       92547 :     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
    3982             :       // The VALU version adds the second operand to the result, so insert an
    3983             :       // extra 0 operand.
    3984         128 :       Inst.addOperand(MachineOperand::CreateImm(0));
    3985             :     }
    3986             : 
    3987       94092 :     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    3988             : 
    3989       94092 :     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
    3990        3288 :       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
    3991             :       // If we need to move this to VGPRs, we need to unpack the second operand
    3992             :       // back into the 2 separate ones for bit offset and width.
    3993             :       assert(OffsetWidthOp.isImm() &&
    3994             :              "Scalar BFE is only implemented for constant width and offset");
    3995        3288 :       uint32_t Imm = OffsetWidthOp.getImm();
    3996             : 
    3997        3288 :       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    3998        3288 :       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
    3999        3288 :       Inst.RemoveOperand(2);                     // Remove old immediate.
    4000        6576 :       Inst.addOperand(MachineOperand::CreateImm(Offset));
    4001        6576 :       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    4002             :     }
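                     :     // For example (a sketch of the decoding above): an S_BFE_U32 immediate of
                     :     // 0x100008 unpacks to Offset = 0x100008 & 0x3f = 8 and
                     :     // BitWidth = (0x100008 & 0x7f0000) >> 16 = 16, i.e. extract 16 bits
                     :     // starting at bit 8.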
    4003             : 
    4004      282201 :     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    4005             :     unsigned NewDstReg = AMDGPU::NoRegister;
    4006             :     if (HasDst) {
    4007       93942 :       unsigned DstReg = Inst.getOperand(0).getReg();
    4008       93942 :       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
    4009          57 :         continue;
    4010             : 
    4011             :       // Update the destination register class.
    4012       93885 :       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
    4013       93885 :       if (!NewDstRC)
    4014           0 :         continue;
    4015             : 
    4016       46083 :       if (Inst.isCopy() &&
    4017      139799 :           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
    4018       45914 :           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
    4019             :         // Instead of creating a copy where src and dst are the same register
    4020             :         // class, we just replace all uses of dst with src.  These kinds of
    4021             :         // copies interfere with the heuristics MachineSink uses to decide
    4022             :         // whether or not to split a critical edge, since the pass assumes
    4023             :         // that copies will end up as machine instructions and not be
    4024             :         // eliminated.
    4025       17819 :         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
    4026       17819 :         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
    4027       17819 :         MRI.clearKillFlags(Inst.getOperand(1).getReg());
    4028       17819 :         Inst.getOperand(0).setReg(DstReg);
    4029             : 
    4030             :         // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
    4031             :         // these are deleted later, but at -O0 it would leave a
    4032             :         // suspicious-looking illegal copy of an undef register.
    4033       35638 :         for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
    4034       17819 :           Inst.RemoveOperand(I);
    4035       17819 :         Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
    4036       17819 :         continue;
    4037             :       }
    4038             : 
    4039       76066 :       NewDstReg = MRI.createVirtualRegister(NewDstRC);
    4040       76066 :       MRI.replaceRegWith(DstReg, NewDstReg);
    4041             :     }
    4042             : 
    4043             :     // Legalize the operands
    4044       76216 :     legalizeOperands(Inst);
    4045             : 
    4046       76216 :     if (HasDst)
    4047       76066 :      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    4048             :   }
    4049       36008 : }
    4050             : 
    4051             : // Add/sub require special handling to deal with carry outs.
    4052        3330 : bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
    4053             :                                    MachineInstr &Inst) const {
    4054        3330 :   if (ST.hasAddNoCarry()) {
    4055             :     // Assume there is no user of scc since we don't select this in that case.
    4056             :     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    4057             :     // is used.
    4058             : 
    4059         262 :     MachineBasicBlock &MBB = *Inst.getParent();
    4060         262 :     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4061             : 
    4062         262 :     unsigned OldDstReg = Inst.getOperand(0).getReg();
    4063         262 :     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4064             : 
    4065         262 :     unsigned Opc = Inst.getOpcode();
    4066             :     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
    4067             : 
    4068         262 :     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
    4069             :       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
    4070             : 
    4071             :     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    4072         262 :     Inst.RemoveOperand(3);
    4073             : 
    4074         262 :     Inst.setDesc(get(NewOpc));
    4075         262 :     Inst.addImplicitDefUseOperands(*MBB.getParent());
    4076         262 :     MRI.replaceRegWith(OldDstReg, ResultReg);
    4077         262 :     legalizeOperands(Inst);
    4078             : 
    4079         262 :     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4080         262 :     return true;
    4081             :   }
    4082             : 
    4083             :   return false;
    4084             : }
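                     : // On subtargets with the carry-less add, the rewrite above is roughly
                     : // (register names illustrative, extra operands elided):
                     : //
                     : //   %dst:sgpr = S_ADD_I32 %a, %b        ; also defines scc (unused here)
                     : //     -->
                     : //   %result:vgpr = V_ADD_U32_e64 %a, %b
                     : //
                     : // with the old SGPR destination replaced by %result and its users queued
                     : // for further VALU conversion.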
    4085             : 
    4086          24 : void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
    4087             :                                  MachineInstr &Inst) const {
    4088          24 :   MachineBasicBlock &MBB = *Inst.getParent();
    4089          24 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4090             :   MachineBasicBlock::iterator MII = Inst;
    4091             :   DebugLoc DL = Inst.getDebugLoc();
    4092             : 
    4093          24 :   MachineOperand &Dest = Inst.getOperand(0);
    4094             :   MachineOperand &Src = Inst.getOperand(1);
    4095          24 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4096          24 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4097             : 
    4098          24 :   unsigned SubOp = ST.hasAddNoCarry() ?
    4099             :     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
    4100             : 
    4101          72 :   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
    4102             :     .addImm(0)
    4103          24 :     .addReg(Src.getReg());
    4104             : 
    4105          72 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    4106          24 :     .addReg(Src.getReg())
    4107          24 :     .addReg(TmpReg);
    4108             : 
    4109          24 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4110          24 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4111          24 : }
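                     : // In effect the scalar abs above is lowered as |x| = max(x, 0 - x)
                     : // (a sketch, register names illustrative):
                     : //
                     : //   %tmp    = V_SUB_{U32,I32}_e32 0, %src     ; tmp = -src
                     : //   %result = V_MAX_I32_e64 %src, %tmp        ; max(src, -src)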
    4112             : 
    4113          15 : void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
    4114             :                                   MachineInstr &Inst) const {
    4115          15 :   MachineBasicBlock &MBB = *Inst.getParent();
    4116          15 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4117             :   MachineBasicBlock::iterator MII = Inst;
    4118             :   const DebugLoc &DL = Inst.getDebugLoc();
    4119             : 
    4120          15 :   MachineOperand &Dest = Inst.getOperand(0);
    4121             :   MachineOperand &Src0 = Inst.getOperand(1);
    4122             :   MachineOperand &Src1 = Inst.getOperand(2);
    4123             : 
    4124          15 :   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    4125          15 :   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
    4126             : 
    4127          15 :   unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4128          15 :   if (ST.hasDLInsts()) {
    4129           6 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
    4130             :       .add(Src0)
    4131             :       .add(Src1);
    4132             :   } else {
    4133          12 :     unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4134          24 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
    4135             :       .add(Src0)
    4136             :       .add(Src1);
    4137             : 
    4138          36 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
    4139          12 :       .addReg(Xor);
    4140             :   }
    4141             : 
    4142          15 :   MRI.replaceRegWith(Dest.getReg(), NewDest);
    4143          15 :   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
    4144          15 : }
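                     : // That is, on targets without V_XNOR the xnor above is expanded as
                     : // ~(src0 ^ src1) (register names illustrative):
                     : //
                     : //   %xor    = V_XOR_B32_e64 %src0, %src1
                     : //   %result = V_NOT_B32_e64 %xor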
    4145             : 
    4146          18 : void SIInstrInfo::splitScalar64BitUnaryOp(
    4147             :     SetVectorType &Worklist, MachineInstr &Inst,
    4148             :     unsigned Opcode) const {
    4149          18 :   MachineBasicBlock &MBB = *Inst.getParent();
    4150          18 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4151             : 
    4152          18 :   MachineOperand &Dest = Inst.getOperand(0);
    4153             :   MachineOperand &Src0 = Inst.getOperand(1);
    4154             :   DebugLoc DL = Inst.getDebugLoc();
    4155             : 
    4156             :   MachineBasicBlock::iterator MII = Inst;
    4157             : 
    4158          18 :   const MCInstrDesc &InstDesc = get(Opcode);
    4159          18 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    4160          18 :     MRI.getRegClass(Src0.getReg()) :
    4161             :     &AMDGPU::SGPR_32RegClass;
    4162             : 
    4163          18 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4164             : 
    4165             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4166          18 :                                                        AMDGPU::sub0, Src0SubRC);
    4167             : 
    4168          18 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    4169          18 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    4170          18 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    4171             : 
    4172          18 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    4173          36 :   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
    4174             : 
    4175             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4176          18 :                                                        AMDGPU::sub1, Src0SubRC);
    4177             : 
    4178          18 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    4179          36 :   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
    4180             : 
    4181          18 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    4182          54 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4183          18 :     .addReg(DestSub0)
    4184             :     .addImm(AMDGPU::sub0)
    4185          18 :     .addReg(DestSub1)
    4186             :     .addImm(AMDGPU::sub1);
    4187             : 
    4188          18 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4189             : 
    4190             :   // We don't need to legalizeOperands here because for a single operand, src0
    4191             :   // will support any kind of input.
    4192             : 
    4193             :   // Move all users of this moved value.
    4194          18 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4195          18 : }
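                     : // A 64-bit scalar NOT, for instance, splits into two 32-bit VALU ops as
                     : // above, one per half (a sketch, register names illustrative):
                     : //
                     : //   %lo  = V_NOT_B32 %src.sub0
                     : //   %hi  = V_NOT_B32 %src.sub1
                     : //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1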
    4196             : 
    4197        4261 : void SIInstrInfo::splitScalar64BitAddSub(
    4198             :   SetVectorType &Worklist, MachineInstr &Inst) const {
    4199        4261 :   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    4200             : 
    4201        4261 :   MachineBasicBlock &MBB = *Inst.getParent();
    4202        4261 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4203             : 
    4204        4261 :   unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4205        4261 :   unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4206        4261 :   unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4207             : 
    4208        4261 :   unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    4209        4261 :   unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    4210             : 
    4211        4261 :   MachineOperand &Dest = Inst.getOperand(0);
    4212             :   MachineOperand &Src0 = Inst.getOperand(1);
    4213             :   MachineOperand &Src1 = Inst.getOperand(2);
    4214             :   const DebugLoc &DL = Inst.getDebugLoc();
    4215             :   MachineBasicBlock::iterator MII = Inst;
    4216             : 
    4217        4261 :   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
    4218        4261 :   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
    4219        4261 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4220        4261 :   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
    4221             : 
    4222             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4223        4261 :                                                        AMDGPU::sub0, Src0SubRC);
    4224             :   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4225        4261 :                                                        AMDGPU::sub0, Src1SubRC);
    4226             : 
    4227             : 
    4228             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4229        4261 :                                                        AMDGPU::sub1, Src0SubRC);
    4230             :   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4231        4261 :                                                        AMDGPU::sub1, Src1SubRC);
    4232             : 
    4233        4261 :   unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    4234             :   MachineInstr *LoHalf =
    4235       12783 :     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    4236        4261 :     .addReg(CarryReg, RegState::Define)
    4237             :     .add(SrcReg0Sub0)
    4238        4261 :     .add(SrcReg1Sub0);
    4239             : 
    4240        4261 :   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    4241             :   MachineInstr *HiHalf =
    4242       12783 :     BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    4243        4261 :     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    4244             :     .add(SrcReg0Sub1)
    4245             :     .add(SrcReg1Sub1)
    4246        4261 :     .addReg(CarryReg, RegState::Kill);
    4247             : 
    4248       12783 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4249        4261 :     .addReg(DestSub0)
    4250             :     .addImm(AMDGPU::sub0)
    4251        4261 :     .addReg(DestSub1)
    4252             :     .addImm(AMDGPU::sub1);
    4253             : 
    4254        4261 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4255             : 
    4256             :   // Try to legalize the operands in case we need to swap the order to keep it
    4257             :   // valid.
    4258        4261 :   legalizeOperands(*LoHalf);
    4259        4261 :   legalizeOperands(*HiHalf);
    4260             : 
    4261             :   // Move all users of this moved value.
    4262        4261 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4263        4261 : }
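                     : // A rough sketch of the add case above (sub is analogous with V_SUB/V_SUBB;
                     : // register names illustrative):
                     : //
                     : //   %lo, %carry = V_ADD_I32_e64  %src0.sub0, %src1.sub0
                     : //   %hi, %dead  = V_ADDC_U32_e64 %src0.sub1, %src1.sub1, %carry
                     : //   %dst        = REG_SEQUENCE %lo, sub0, %hi, sub1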
    4264             : 
    4265         446 : void SIInstrInfo::splitScalar64BitBinaryOp(
    4266             :     SetVectorType &Worklist, MachineInstr &Inst,
    4267             :     unsigned Opcode) const {
    4268         446 :   MachineBasicBlock &MBB = *Inst.getParent();
    4269         446 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4270             : 
    4271         446 :   MachineOperand &Dest = Inst.getOperand(0);
    4272             :   MachineOperand &Src0 = Inst.getOperand(1);
    4273             :   MachineOperand &Src1 = Inst.getOperand(2);
    4274             :   DebugLoc DL = Inst.getDebugLoc();
    4275             : 
    4276             :   MachineBasicBlock::iterator MII = Inst;
    4277             : 
    4278         446 :   const MCInstrDesc &InstDesc = get(Opcode);
    4279         446 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    4280         446 :     MRI.getRegClass(Src0.getReg()) :
    4281             :     &AMDGPU::SGPR_32RegClass;
    4282             : 
    4283         446 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4284         446 :   const TargetRegisterClass *Src1RC = Src1.isReg() ?
    4285         446 :     MRI.getRegClass(Src1.getReg()) :
    4286             :     &AMDGPU::SGPR_32RegClass;
    4287             : 
    4288         446 :   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
    4289             : 
    4290             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4291         446 :                                                        AMDGPU::sub0, Src0SubRC);
    4292             :   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4293         446 :                                                        AMDGPU::sub0, Src1SubRC);
    4294             : 
    4295         446 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    4296         446 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    4297         446 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    4298             : 
    4299         446 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    4300         446 :   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    4301             :                               .add(SrcReg0Sub0)
    4302             :                               .add(SrcReg1Sub0);
    4303             : 
    4304             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4305         446 :                                                        AMDGPU::sub1, Src0SubRC);
    4306             :   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4307         446 :                                                        AMDGPU::sub1, Src1SubRC);
    4308             : 
    4309         446 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    4310         446 :   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    4311             :                               .add(SrcReg0Sub1)
    4312             :                               .add(SrcReg1Sub1);
    4313             : 
    4314         446 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    4315        1338 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4316         446 :     .addReg(DestSub0)
    4317             :     .addImm(AMDGPU::sub0)
    4318         446 :     .addReg(DestSub1)
    4319             :     .addImm(AMDGPU::sub1);
    4320             : 
    4321         446 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4322             : 
    4323             :   // Try to legalize the operands in case we need to swap the order to keep it
    4324             :   // valid.
    4325         446 :   legalizeOperands(LoHalf);
    4326         446 :   legalizeOperands(HiHalf);
    4327             : 
    4328             :   // Move all users of this moved value.
    4329         446 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4330         446 : }
    4331             : 
    4332          26 : void SIInstrInfo::splitScalar64BitBCNT(
    4333             :     SetVectorType &Worklist, MachineInstr &Inst) const {
    4334          26 :   MachineBasicBlock &MBB = *Inst.getParent();
    4335          26 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4336             : 
    4337             :   MachineBasicBlock::iterator MII = Inst;
    4338             :   DebugLoc DL = Inst.getDebugLoc();
    4339             : 
    4340          26 :   MachineOperand &Dest = Inst.getOperand(0);
    4341             :   MachineOperand &Src = Inst.getOperand(1);
    4342             : 
    4343          26 :   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
    4344          26 :   const TargetRegisterClass *SrcRC = Src.isReg() ?
    4345          26 :     MRI.getRegClass(Src.getReg()) :
    4346             :     &AMDGPU::SGPR_32RegClass;
    4347             : 
    4348          26 :   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4349          26 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4350             : 
    4351          26 :   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
    4352             : 
    4353             :   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    4354          26 :                                                       AMDGPU::sub0, SrcSubRC);
    4355             :   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    4356          26 :                                                       AMDGPU::sub1, SrcSubRC);
    4357             : 
    4358          26 :   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
    4359             : 
    4360          52 :   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
    4361             : 
    4362          26 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4363             : 
    4364             :   // We don't need to legalize operands here. src0 for either instruction can be
    4365             :   // an SGPR, and the second input is unused or determined here.
    4366          26 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4367          26 : }
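                     : // That is, the 64-bit popcount above is computed as two 32-bit V_BCNT ops,
                     : // the second accumulating into the first (register names illustrative):
                     : //
                     : //   %mid    = V_BCNT_U32_B32 %src.sub0, 0
                     : //   %result = V_BCNT_U32_B32 %src.sub1, %mid   ; bcnt(lo) + bcnt(hi)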
    4368             : 
    4369        1812 : void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
    4370             :                                       MachineInstr &Inst) const {
    4371        1812 :   MachineBasicBlock &MBB = *Inst.getParent();
    4372        1812 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4373             :   MachineBasicBlock::iterator MII = Inst;
    4374             :   DebugLoc DL = Inst.getDebugLoc();
    4375             : 
    4376        1812 :   MachineOperand &Dest = Inst.getOperand(0);
    4377        1812 :   uint32_t Imm = Inst.getOperand(2).getImm();
    4378             :   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    4379        1812 :   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
    4380             : 
    4381             :   (void) Offset;
    4382             : 
    4383             :   // Only sext_inreg cases handled.
    4384             :   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
    4385             :          Offset == 0 && "Not implemented");
    4386             : 
    4387        1812 :   if (BitWidth < 32) {
    4388        1806 :     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4389        1806 :     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4390        1806 :     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4391             : 
    4392        5418 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
    4393        1806 :         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
    4394             :         .addImm(0)
    4395        1806 :         .addImm(BitWidth);
    4396             : 
    4397        5418 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
    4398             :       .addImm(31)
    4399        1806 :       .addReg(MidRegLo);
    4400             : 
    4401        5418 :     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    4402        1806 :       .addReg(MidRegLo)
    4403             :       .addImm(AMDGPU::sub0)
    4404        1806 :       .addReg(MidRegHi)
    4405             :       .addImm(AMDGPU::sub1);
    4406             : 
    4407        1806 :     MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4408        1806 :     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4409             :     return;
    4410             :   }
    4411             : 
    4412             :   MachineOperand &Src = Inst.getOperand(1);
    4413           6 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4414           6 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4415             : 
    4416          18 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    4417             :     .addImm(31)
    4418           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0);
    4419             : 
    4420          18 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    4421           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0)
    4422             :     .addImm(AMDGPU::sub0)
    4423           6 :     .addReg(TmpReg)
    4424             :     .addImm(AMDGPU::sub1);
    4425             : 
    4426           6 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4427           6 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4428             : }
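                     : // E.g. a 64-bit sext_inreg from i16 (BitWidth = 16) handled above becomes,
                     : // roughly (register names illustrative):
                     : //
                     : //   %lo  = V_BFE_I32 %src.sub0, 0, 16    ; sign-extend the low 16 bits
                     : //   %hi  = V_ASHRREV_I32 31, %lo         ; replicate the sign bit
                     : //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1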
    4429             : 
    4430      101004 : void SIInstrInfo::addUsersToMoveToVALUWorklist(
    4431             :   unsigned DstReg,
    4432             :   MachineRegisterInfo &MRI,
    4433             :   SetVectorType &Worklist) const {
    4434      101004 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
    4435      225364 :          E = MRI.use_end(); I != E;) {
    4436      124360 :     MachineInstr &UseMI = *I->getParent();
    4437      124360 :     if (!canReadVGPR(UseMI, I.getOperandNo())) {
    4438       65237 :       Worklist.insert(&UseMI);
    4439             : 
    4440             :       do {
    4441             :         ++I;
    4442       65373 :       } while (I != E && I->getParent() == &UseMI);
    4443             :     } else {
    4444             :       ++I;
    4445             :     }
    4446             :   }
    4447      101004 : }
    4448             : 
    4449         215 : void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
    4450             :                                  MachineRegisterInfo &MRI,
    4451             :                                  MachineInstr &Inst) const {
    4452         215 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4453         215 :   MachineBasicBlock *MBB = Inst.getParent();
    4454         215 :   MachineOperand &Src0 = Inst.getOperand(1);
    4455             :   MachineOperand &Src1 = Inst.getOperand(2);
    4456             :   const DebugLoc &DL = Inst.getDebugLoc();
    4457             : 
    4458         430 :   switch (Inst.getOpcode()) {
    4459             :   case AMDGPU::S_PACK_LL_B32_B16: {
    4460         207 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4461         207 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4462             : 
    4463             :     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    4464             :     // 0.
    4465         621 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4466             :       .addImm(0xffff);
    4467             : 
    4468         621 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
    4469         207 :       .addReg(ImmReg, RegState::Kill)
    4470             :       .add(Src0);
    4471             : 
    4472         621 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
    4473             :       .add(Src1)
    4474             :       .addImm(16)
    4475         207 :       .addReg(TmpReg, RegState::Kill);
    4476         207 :     break;
    4477             :   }
    4478             :   case AMDGPU::S_PACK_LH_B32_B16: {
    4479           6 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4480          18 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4481             :       .addImm(0xffff);
    4482          18 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
    4483           6 :       .addReg(ImmReg, RegState::Kill)
    4484             :       .add(Src0)
    4485             :       .add(Src1);
    4486           6 :     break;
    4487             :   }
    4488             :   case AMDGPU::S_PACK_HH_B32_B16: {
    4489           2 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4490           2 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4491           4 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
    4492             :       .addImm(16)
    4493             :       .add(Src0);
    4494           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4495             :       .addImm(0xffff0000);
    4496           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
    4497             :       .add(Src1)
    4498           2 :       .addReg(ImmReg, RegState::Kill)
    4499           2 :       .addReg(TmpReg, RegState::Kill);
    4500           2 :     break;
    4501             :   }
    4502           0 :   default:
    4503           0 :     llvm_unreachable("unhandled s_pack_* instruction");
    4504             :   }
    4505             : 
    4506         215 :   MachineOperand &Dest = Inst.getOperand(0);
    4507         215 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4508         215 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4509         215 : }
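                     : // For reference, the S_PACK_LL case above computes
                     : //   result = (src1 << 16) | (src0 & 0xffff)
                     : // via the V_AND_B32 + V_LSHL_OR_B32 pair; the LH and HH cases select the
                     : // other half-word combinations analogously.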
    4510             : 
    4511       25964 : void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    4512             :     MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
    4513             :   // This assumes that all the users of SCC are in the same block
    4514             :   // as the SCC def.
    4515             :   for (MachineInstr &MI :
    4516             :        make_range(MachineBasicBlock::iterator(SCCDefInst),
    4517      683301 :                       SCCDefInst.getParent()->end())) {
    4518             :     // Exit if we find another SCC def.
    4519      622839 :     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
    4520       17430 :       return;
    4521             : 
    4522      605409 :     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
    4523          75 :       Worklist.insert(&MI);
    4524             :   }
    4525             : }
    4526             : 
    4527       93885 : const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    4528             :   const MachineInstr &Inst) const {
    4529       93885 :   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
    4530             : 
    4531      187770 :   switch (Inst.getOpcode()) {
    4532             :   // For target instructions, getOpRegClass just returns the virtual register
    4533             :   // class associated with the operand, so we need to find an equivalent VGPR
    4534             :   // register class in order to move the instruction to the VALU.
    4535       65392 :   case AMDGPU::COPY:
    4536             :   case AMDGPU::PHI:
    4537             :   case AMDGPU::REG_SEQUENCE:
    4538             :   case AMDGPU::INSERT_SUBREG:
    4539             :   case AMDGPU::WQM:
    4540             :   case AMDGPU::WWM:
    4541       65392 :     if (RI.hasVGPRs(NewDstRC))
    4542             :       return nullptr;
    4543             : 
    4544       65392 :     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    4545       65392 :     if (!NewDstRC)
    4546             :       return nullptr;
    4547       65392 :     return NewDstRC;
    4548             :   default:
    4549             :     return NewDstRC;
    4550             :   }
    4551             : }
    4552             : 
    4553             : // Find the one SGPR operand we are allowed to use.
    4554       59658 : unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
    4555             :                                    int OpIndices[3]) const {
    4556       59658 :   const MCInstrDesc &Desc = MI.getDesc();
    4557             : 
    4558             :   // Find the one SGPR operand we are allowed to use.
    4559             :   //
    4560             :   // First we need to consider the instruction's operand requirements before
    4561             :   // legalizing. Some operands are required to be SGPRs, such as implicit uses
    4562             :   // of VCC, but we are still bound by the constant bus requirement to only use
    4563             :   // one.
    4564             :   //
    4565             :   // If the operand's class is an SGPR, we can never move it.
    4566             : 
    4567       59658 :   unsigned SGPRReg = findImplicitSGPRRead(MI);
    4568       59658 :   if (SGPRReg != AMDGPU::NoRegister)
    4569             :     return SGPRReg;
    4570             : 
    4571       59526 :   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
    4572       59526 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    4573             : 
    4574      305722 :   for (unsigned i = 0; i < 3; ++i) {
    4575      173305 :     int Idx = OpIndices[i];
    4576      173305 :     if (Idx == -1)
    4577             :       break;
    4578             : 
    4579      131817 :     const MachineOperand &MO = MI.getOperand(Idx);
    4580      131817 :     if (!MO.isReg())
    4581       11825 :       continue;
    4582             : 
    4583             :     // Is this operand statically required to be an SGPR based on the operand
    4584             :     // constraints?
    4585      119992 :     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    4586      119992 :     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    4587      119992 :     if (IsRequiredSGPR)
    4588        8719 :       return MO.getReg();
    4589             : 
    4590             :     // If this could be a VGPR or an SGPR, Check the dynamic register class.
    4591      111273 :     unsigned Reg = MO.getReg();
    4592             :     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    4593      111273 :     if (RI.isSGPRClass(RegRC))
    4594       48868 :       UsedSGPRs[i] = Reg;
    4595             :   }
    4596             : 
    4597             :   // We don't have a required SGPR operand, so we have a bit more freedom in
    4598             :   // selecting operands to move.
    4599             : 
    4600             :   // Try to select the most used SGPR. If an SGPR is equal to one of the
    4601             :   // others, we choose that.
    4602             :   //
    4603             :   // e.g.
    4604             :   // V_FMA_F32 v0, s0, s0, s0 -> No moves
    4605             :   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
    4606             : 
    4607             :   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
    4608             :   // prefer those.
    4609             : 
    4610       50807 :   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    4611       22343 :     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
    4612             :       SGPRReg = UsedSGPRs[0];
    4613             :   }
    4614             : 
    4615       50807 :   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    4616       16183 :     if (UsedSGPRs[1] == UsedSGPRs[2])
    4617             :       SGPRReg = UsedSGPRs[1];
    4618             :   }
    4619             : 
    4620             :   return SGPRReg;
    4621             : }
    4622             : 
    4623    13545450 : MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
    4624             :                                              unsigned OperandName) const {
    4625    27090900 :   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
    4626    13545450 :   if (Idx == -1)
    4627             :     return nullptr;
    4628             : 
    4629     4676448 :   return &MI.getOperand(Idx);
    4630             : }
    4631             : 
    4632       21695 : uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
    4633             :   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
    4634       43390 :   if (ST.isAmdHsaOS()) {
    4635             :     // Set ATC = 1. GFX9 doesn't have this bit.
    4636         683 :     if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
    4637             :       RsrcDataFormat |= (1ULL << 56);
    4638             : 
    4639             :     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    4640             :     // BTW, it disables TC L2 and therefore decreases performance.
    4641         683 :     if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
    4642         345 :       RsrcDataFormat |= (2ULL << 59);
    4643             :   }
    4644             : 
    4645       21695 :   return RsrcDataFormat;
    4646             : }
    4647             : 
    4648         433 : uint64_t SIInstrInfo::getScratchRsrcWords23() const {
    4649         433 :   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
    4650             :                     AMDGPU::RSRC_TID_ENABLE |
    4651         433 :                     0xffffffff; // Size;
    4652             : 
    4653             :   // GFX9 doesn't have ELEMENT_SIZE.
    4654         433 :   if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    4655         734 :     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    4656         367 :     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
    4657             :   }
    4658             : 
    4659             :   // IndexStride = 64.
    4660         433 :   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
    4661             : 
    4662             :   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
    4663             :   // Clear them unless we want a huge stride.
    4664         433 :   if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4665         218 :     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
    4666             : 
    4667         433 :   return Rsrc23;
    4668             : }
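                     : // For instance, with a maximum private element size of 16 bytes the
                     : // ELEMENT_SIZE field above is Log2_32(16) - 1 = 3; the exact bit positions
                     : // come from the AMDGPU::RSRC_*_SHIFT constants.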
    4669             : 
    4670          60 : bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
    4671          60 :   unsigned Opc = MI.getOpcode();
    4672             : 
    4673          60 :   return isSMRD(Opc);
    4674             : }
    4675             : 
    4676          14 : bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
    4677          14 :   unsigned Opc = MI.getOpcode();
    4678             : 
    4679          42 :   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
    4680             : }
    4681             : 
    4682        2807 : unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
    4683             :                                     int &FrameIndex) const {
    4684             :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    4685        5536 :   if (!Addr || !Addr->isFI())
    4686             :     return AMDGPU::NoRegister;
    4687             : 
    4688             :   assert(!MI.memoperands_empty() &&
    4689             :          (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
    4690             : 
    4691        2060 :   FrameIndex = Addr->getIndex();
    4692        2060 :   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
    4693             : }
    4694             : 
    4695          28 : unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
    4696             :                                         int &FrameIndex) const {
    4697             :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
    4698             :   assert(Addr && Addr->isFI());
    4699          28 :   FrameIndex = Addr->getIndex();
    4700          28 :   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
    4701             : }
    4702             : 
    4703       17378 : unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
    4704             :                                           int &FrameIndex) const {
    4705       17378 :   if (!MI.mayLoad())
    4706             :     return AMDGPU::NoRegister;
    4707             : 
    4708        2887 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4709        1407 :     return isStackAccess(MI, FrameIndex);
    4710             : 
    4711         734 :   if (isSGPRSpill(MI))
    4712          27 :     return isSGPRStackAccess(MI, FrameIndex);
    4713             : 
    4714             :   return AMDGPU::NoRegister;
    4715             : }
    4716             : 
    4717        8863 : unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
    4718             :                                          int &FrameIndex) const {
    4719        8863 :   if (!MI.mayStore())
    4720             :     return AMDGPU::NoRegister;
    4721             : 
    4722        2378 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4723        1400 :     return isStackAccess(MI, FrameIndex);
    4724             : 
    4725         489 :   if (isSGPRSpill(MI))
    4726           1 :     return isSGPRStackAccess(MI, FrameIndex);
    4727             : 
    4728             :   return AMDGPU::NoRegister;
    4729             : }
    4730             : 
    4731        1036 : unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
    4732             :   unsigned Size = 0;
    4733        1036 :   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
    4734        1036 :   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
    4735       11396 :   while (++I != E && I->isInsideBundle()) {
    4736             :     assert(!I->isBundle() && "No nested bundle!");
    4737        3108 :     Size += getInstSizeInBytes(*I);
    4738             :   }
    4739             : 
    4740        1036 :   return Size;
    4741             : }
    4742             : 
    4743      674794 : unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
    4744      674794 :   unsigned Opc = MI.getOpcode();
    4745             :   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
    4746      674794 :   unsigned DescSize = Desc.getSize();
    4747             : 
    4748             :   // If we have a definitive size, we can use it. Otherwise we need to inspect
    4749             :   // the operands to know the size.
    4750             :   //
    4751             :   // FIXME: Instructions that have a base 32-bit encoding report their size as
    4752             :   // 4, even though they are really 8 bytes if they have a literal operand.
    4753      674794 :   if (DescSize != 0 && DescSize != 4)
    4754             :     return DescSize;
    4755             : 
    4756             :   // 4-byte instructions may have a 32-bit literal encoded after them. Check
    4757             :   // operands that could ever be literals.
    4758      777524 :   if (isVALU(MI) || isSALU(MI)) {
    4759      458122 :     if (isFixedSize(MI))
    4760             :       return DescSize;
    4761             : 
    4762      457032 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    4763      457032 :     if (Src0Idx == -1)
    4764             :       return 4; // No operands.
    4765             : 
    4766      675526 :     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
    4767             :       return 8;
    4768             : 
    4769      307019 :     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    4770      307019 :     if (Src1Idx == -1)
    4771             :       return 4;
    4772             : 
    4773      256798 :     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
    4774             :       return 8;
    4775             : 
    4776      117287 :     return 4;
    4777             :   }
    4778             : 
    4779       30926 :   if (DescSize == 4)
    4780             :     return 4;
    4781             : 
    4782             :   switch (Opc) {
    4783             :   case TargetOpcode::IMPLICIT_DEF:
    4784             :   case TargetOpcode::KILL:
    4785             :   case TargetOpcode::DBG_VALUE:
    4786             :   case TargetOpcode::EH_LABEL:
    4787             :     return 0;
    4788        1036 :   case TargetOpcode::BUNDLE:
    4789        1036 :     return getInstBundleSize(MI);
    4790        2801 :   case TargetOpcode::INLINEASM: {
    4791        2801 :     const MachineFunction *MF = MI.getParent()->getParent();
    4792        2801 :     const char *AsmStr = MI.getOperand(0).getSymbolName();
    4793        2801 :     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
    4794             :   }
    4795           0 :   default:
    4796           0 :     llvm_unreachable("unable to find instruction size");
    4797             :   }
    4798             : }
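                     : // For example, "v_add_f32 v0, 1.0, v1" uses an inline constant and is
                     : // reported as 4 bytes, while "v_add_f32 v0, 0x40490fdb, v1" carries a
                     : // 32-bit literal and is reported as 8 bytes.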
    4799             : 
    4800           0 : bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
    4801           0 :   if (!isFLAT(MI))
    4802             :     return false;
    4803             : 
    4804           0 :   if (MI.memoperands_empty())
    4805             :     return true;
    4806             : 
    4807           0 :   for (const MachineMemOperand *MMO : MI.memoperands()) {
    4808           0 :     if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
    4809             :       return true;
    4810             :   }
    4811             :   return false;
    4812             : }
    4813             : 
    4814           0 : bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
    4815           0 :   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
    4816             : }
    4817             : 
    4818           0 : void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
    4819             :                                             MachineBasicBlock *IfEnd) const {
    4820           0 :   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
    4821             :   assert(TI != IfEntry->end());
    4822             : 
    4823             :   MachineInstr *Branch = &(*TI);
    4824           0 :   MachineFunction *MF = IfEntry->getParent();
    4825           0 :   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
    4826             : 
    4827           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    4828           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4829             :     MachineInstr *SIIF =
    4830           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
    4831           0 :             .add(Branch->getOperand(0))
    4832           0 :             .add(Branch->getOperand(1));
    4833             :     MachineInstr *SIEND =
    4834           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
    4835           0 :             .addReg(DstReg);
    4836             : 
    4837           0 :     IfEntry->erase(TI);
    4838             :     IfEntry->insert(IfEntry->end(), SIIF);
    4839           0 :     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
    4840             :   }
    4841           0 : }
    4842             : 
    4843           0 : void SIInstrInfo::convertNonUniformLoopRegion(
    4844             :     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
    4845           0 :   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
    4846             :   // We expect 2 terminators, one conditional and one unconditional.
    4847             :   assert(TI != LoopEnd->end());
    4848             : 
    4849             :   MachineInstr *Branch = &(*TI);
    4850           0 :   MachineFunction *MF = LoopEnd->getParent();
    4851           0 :   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
    4852             : 
    4853           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    4854             : 
    4855           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4856           0 :     unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4857             :     MachineInstrBuilder HeaderPHIBuilder =
    4858           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    4859             :     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
    4860             :                                           E = LoopEntry->pred_end();
    4861           0 :          PI != E; ++PI) {
    4862           0 :       if (*PI == LoopEnd) {
    4863           0 :         HeaderPHIBuilder.addReg(BackEdgeReg);
    4864             :       } else {
    4865             :         MachineBasicBlock *PMBB = *PI;
    4866           0 :         unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4867           0 :         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
    4868             :                              ZeroReg, 0);
    4869           0 :         HeaderPHIBuilder.addReg(ZeroReg);
    4870             :       }
    4871           0 :       HeaderPHIBuilder.addMBB(*PI);
    4872             :     }
    4873           0 :     MachineInstr *HeaderPhi = HeaderPHIBuilder;
    4874           0 :     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
    4875           0 :                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
    4876           0 :                                   .addReg(DstReg)
    4877           0 :                                   .add(Branch->getOperand(0));
    4878             :     MachineInstr *SILOOP =
    4879           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
    4880           0 :             .addReg(BackEdgeReg)
    4881           0 :             .addMBB(LoopEntry);
    4882             : 
    4883             :     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    4884           0 :     LoopEnd->erase(TI);
    4885             :     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    4886             :     LoopEnd->insert(LoopEnd->end(), SILOOP);
    4887             :   }
    4888           0 : }
    4889             : 
    4890             : ArrayRef<std::pair<int, const char *>>
    4891           5 : SIInstrInfo::getSerializableTargetIndices() const {
    4892             :   static const std::pair<int, const char *> TargetIndices[] = {
    4893             :       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
    4894             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
    4895             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
    4896             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
    4897             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
    4898           5 :   return makeArrayRef(TargetIndices);
    4899             : }
    4900             : 
    4901             : /// This is used by the post-RA scheduler (PostRASchedulerList.cpp).  The
    4902             : /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
    4903             : ScheduleHazardRecognizer *
    4904       14125 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
    4905             :                                             const ScheduleDAG *DAG) const {
    4906       14125 :   return new GCNHazardRecognizer(DAG->MF);
    4907             : }
    4908             : 
    4909             : /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
    4910             : /// pass.
    4911             : ScheduleHazardRecognizer *
    4912       18045 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
    4913       18045 :   return new GCNHazardRecognizer(MF);
    4914             : }
    4915             : 
    4916             : std::pair<unsigned, unsigned>
    4917          27 : SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
    4918          27 :   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
    4919             : }
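
A caller-side sketch of the decomposition (hypothetical use; MO stands for some MachineOperand): the first element of the pair is the "direct" flag selected by MO_MASK, the second is any remaining bitmask flags, which for the direct flags listed in getSerializableDirectMachineOperandTargetFlags below comes out as zero.

    std::pair<unsigned, unsigned> Split =
        TII->decomposeMachineOperandsTargetFlags(MO.getTargetFlags());
    // Split.first  : the direct flag, e.g. MO_GOTPCREL32_LO
    // Split.second : leftover bitmask flags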
    4920             : 
    4921             : ArrayRef<std::pair<unsigned, const char *>>
    4922          38 : SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
    4923             :   static const std::pair<unsigned, const char *> TargetFlags[] = {
    4924             :     { MO_GOTPCREL, "amdgpu-gotprel" },
    4925             :     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    4926             :     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    4927             :     { MO_REL32_LO, "amdgpu-rel32-lo" },
    4928             :     { MO_REL32_HI, "amdgpu-rel32-hi" }
    4929             :   };
    4930             : 
    4931          38 :   return makeArrayRef(TargetFlags);
    4932             : }
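
In MIR these names appear as operand target flags; the PC-relative addressing sequence, for example, serializes roughly like this (illustrative; @sym is a made-up symbol):

    $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @sym, implicit-def $scc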
    4933             : 
    4934         792 : bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
    4935        2183 :   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
    4936        1431 :          MI.modifiesRegister(AMDGPU::EXEC, &RI);
    4937             : }
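
The main instruction this is meant to match is the exec-mask restore that control-flow lowering leaves at the top of a join block, for example (sketch):

    $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3

It is not a terminator, it is not a COPY, and it writes EXEC, so code that inserts at the top of a block (spill reloads, for instance) is placed after it.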
    4938             : 
    4939             : MachineInstrBuilder
    4940          92 : SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
    4941             :                            MachineBasicBlock::iterator I,
    4942             :                            const DebugLoc &DL,
    4943             :                            unsigned DestReg) const {
    4944          92 :   if (ST.hasAddNoCarry())
    4945          74 :     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
    4946             : 
    4947          55 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4948          55 :   unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4949             :   MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
    4950             : 
    4951         165 :   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
    4952          55 :            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
    4953             : }
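
A minimal caller-side sketch (Src0 and Src1 are illustrative registers, not names from this file): the caller appends its two source operands to whichever add variant was selected; on pre-GFX9 targets the dead carry-out def has already been attached above.

    TII->getAddNoCarry(MBB, I, DL, DestReg)
        .addReg(Src0)
        .addReg(Src1);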
    4954             : 
    4955         122 : bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
    4956         122 :   switch (Opcode) {
    4957             :   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
    4958             :   case AMDGPU::SI_KILL_I1_TERMINATOR:
    4959             :     return true;
    4960         117 :   default:
    4961         117 :     return false;
    4962             :   }
    4963             : }
    4964             : 
    4965          84 : const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
    4966          84 :   switch (Opcode) {
    4967          52 :   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    4968         104 :     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
    4969          32 :   case AMDGPU::SI_KILL_I1_PSEUDO:
    4970          64 :     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
    4971           0 :   default:
    4972           0 :     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
    4973             :   }
    4974             : }
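
A pass that lowers the kill pseudos (the skip-insertion pass, for example) can convert the pseudo in place along these lines (sketch; MI is an SI_KILL_*_PSEUDO instruction):

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

The operands are left untouched; only the descriptor is swapped for the terminator form.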
    4975             : 
    4976       15281 : bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
    4977       15281 :   if (!isSMRD(MI))
    4978             :     return false;
    4979             : 
    4980             :   // Check that it is using a buffer resource.
    4981       15281 :   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
    4982       15281 :   if (Idx == -1) // e.g. s_memtime
    4983             :     return false;
    4984             : 
    4985       15270 :   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
    4986       15270 :   return RCID == AMDGPU::SReg_128RegClassID;
    4987             : }
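
Concretely, this separates instructions like S_BUFFER_LOAD_DWORD_IMM, whose sbase operand is a 128-bit buffer resource descriptor (SReg_128), from S_LOAD_DWORD_IMM, whose sbase is a plain 64-bit address (SReg_64); only the former is reported as a buffer SMRD, and opcodes with no sbase at all (s_memtime) are rejected by the Idx == -1 check above. Schematically (not exact MIR syntax):

    %v = S_BUFFER_LOAD_DWORD_IMM %rsrc:sreg_128, 0, 0    -> true
    %v = S_LOAD_DWORD_IMM %addr:sreg_64, 0, 0            -> false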
    4988             : 
    4989             : // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
    4990             : enum SIEncodingFamily {
    4991             :   SI = 0,
    4992             :   VI = 1,
    4993             :   SDWA = 2,
    4994             :   SDWA9 = 3,
    4995             :   GFX80 = 4,
    4996             :   GFX9 = 5
    4997             : };
    4998             : 
    4999             : static SIEncodingFamily subtargetEncodingFamily(const SISubtarget &ST) {
    5000     1183811 :   switch (ST.getGeneration()) {
    5001             :   case SISubtarget::SOUTHERN_ISLANDS:
    5002             :   case SISubtarget::SEA_ISLANDS:
    5003             :     return SIEncodingFamily::SI;
    5004      631250 :   case SISubtarget::VOLCANIC_ISLANDS:
    5005             :   case SISubtarget::GFX9:
    5006             :     return SIEncodingFamily::VI;
    5007             :   }
    5008           0 :   llvm_unreachable("Unknown subtarget generation!");
    5009             : }
    5010             : 
    5011     1183811 : int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
    5012     1183811 :   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
    5013             : 
    5014     2450121 :   if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
    5015       82499 :     ST.getGeneration() >= SISubtarget::GFX9)
    5016             :     Gen = SIEncodingFamily::GFX9;
    5017             : 
    5018     1183811 :   if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
    5019        4724 :     Gen = ST.getGeneration() == SISubtarget::GFX9 ? SIEncodingFamily::SDWA9
    5020             :                                                       : SIEncodingFamily::SDWA;
    5021             :   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
    5022             :   // subtarget has UnpackedD16VMem feature.
    5023             :   // TODO: remove this when we discard GFX80 encoding.
    5024     1183811 :   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    5025             :     Gen = SIEncodingFamily::GFX80;
    5026             : 
    5027     1183811 :   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
    5028             : 
    5029             :   // -1 means that Opcode is already a native instruction.
    5030     1183811 :   if (MCOp == -1)
    5031             :     return Opcode;
    5032             : 
    5033             :   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
    5034             :   // no encoding in the given subtarget generation.
    5035      991367 :   if (MCOp == (uint16_t)-1)
    5036             :     return -1;
    5037             : 
    5038      970401 :   return MCOp;
    5039      299229 : }
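
A caller-side sketch of how this result is typically consumed when lowering a MachineInstr to an MCInst (names and error handling are illustrative, not taken from this file):

    int MCOpcode = TII->pseudoToMCOpcode(MI->getOpcode());
    if (MCOpcode == -1)
      report_fatal_error("pseudo instruction has no MC encoding on this subtarget");
    OutMI.setOpcode(MCOpcode);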

Generated by: LCOV version 1.13