LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInstrInfo.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-10-20 13:21:21
Coverage: Lines: 1865 of 2296 (81.2 %)   Functions: 114 of 128 (89.1 %)
Legend: Lines: hit / not hit

          Line data    Source code
       1             : //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// SI Implementation of TargetInstrInfo.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIInstrInfo.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUIntrinsicInfo.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "GCNHazardRecognizer.h"
      20             : #include "SIDefines.h"
      21             : #include "SIMachineFunctionInfo.h"
      22             : #include "SIRegisterInfo.h"
      23             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      24             : #include "Utils/AMDGPUBaseInfo.h"
      25             : #include "llvm/ADT/APInt.h"
      26             : #include "llvm/ADT/ArrayRef.h"
      27             : #include "llvm/ADT/SmallVector.h"
      28             : #include "llvm/ADT/StringRef.h"
      29             : #include "llvm/ADT/iterator_range.h"
      30             : #include "llvm/Analysis/AliasAnalysis.h"
      31             : #include "llvm/Analysis/MemoryLocation.h"
      32             : #include "llvm/Analysis/ValueTracking.h"
      33             : #include "llvm/CodeGen/MachineBasicBlock.h"
      34             : #include "llvm/CodeGen/MachineDominators.h"
      35             : #include "llvm/CodeGen/MachineFrameInfo.h"
      36             : #include "llvm/CodeGen/MachineFunction.h"
      37             : #include "llvm/CodeGen/MachineInstr.h"
      38             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      39             : #include "llvm/CodeGen/MachineInstrBundle.h"
      40             : #include "llvm/CodeGen/MachineMemOperand.h"
      41             : #include "llvm/CodeGen/MachineOperand.h"
      42             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      43             : #include "llvm/CodeGen/RegisterScavenging.h"
      44             : #include "llvm/CodeGen/ScheduleDAG.h"
      45             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      46             : #include "llvm/CodeGen/TargetOpcodes.h"
      47             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      48             : #include "llvm/IR/DebugLoc.h"
      49             : #include "llvm/IR/DiagnosticInfo.h"
      50             : #include "llvm/IR/Function.h"
      51             : #include "llvm/IR/InlineAsm.h"
      52             : #include "llvm/IR/LLVMContext.h"
      53             : #include "llvm/MC/MCInstrDesc.h"
      54             : #include "llvm/Support/Casting.h"
      55             : #include "llvm/Support/CommandLine.h"
      56             : #include "llvm/Support/Compiler.h"
      57             : #include "llvm/Support/ErrorHandling.h"
      58             : #include "llvm/Support/MachineValueType.h"
      59             : #include "llvm/Support/MathExtras.h"
      60             : #include "llvm/Target/TargetMachine.h"
      61             : #include <cassert>
      62             : #include <cstdint>
      63             : #include <iterator>
      64             : #include <utility>
      65             : 
      66             : using namespace llvm;
      67             : 
      68             : #define GET_INSTRINFO_CTOR_DTOR
      69             : #include "AMDGPUGenInstrInfo.inc"
      70             : 
      71             : namespace llvm {
      72             : namespace AMDGPU {
      73             : #define GET_D16ImageDimIntrinsics_IMPL
      74             : #define GET_ImageDimIntrinsicTable_IMPL
      75             : #define GET_RsrcIntrinsics_IMPL
      76             : #include "AMDGPUGenSearchableTables.inc"
      77             : }
      78             : }
      79             : 
      80             : 
       81             : // Must be at least 4 to be able to branch over the minimum unconditional branch
      82             : // code. This is only for making it possible to write reasonably small tests for
      83             : // long branches.
      84             : static cl::opt<unsigned>
      85             : BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
      86             :                  cl::desc("Restrict range of branch instructions (DEBUG)"));
      87             : 
      88        2492 : SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
      89             :   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
      90        2492 :     RI(ST), ST(ST) {}
      91             : 
      92             : //===----------------------------------------------------------------------===//
      93             : // TargetInstrInfo callbacks
      94             : //===----------------------------------------------------------------------===//
      95             : 
      96             : static unsigned getNumOperandsNoGlue(SDNode *Node) {
      97        7246 :   unsigned N = Node->getNumOperands();
      98      471212 :   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
      99             :     --N;
     100             :   return N;
     101             : }
     102             : 
     103      428134 : static SDValue findChainOperand(SDNode *Load) {
     104      856268 :   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
     105             :   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
     106      428134 :   return LastOp;
     107             : }
     108             : 
     109             : /// Returns true if both nodes have the same value for the given
      110             : ///        operand \p OpName, or if both nodes do not have this operand.
     111      600570 : static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
     112      600570 :   unsigned Opc0 = N0->getMachineOpcode();
     113      600570 :   unsigned Opc1 = N1->getMachineOpcode();
     114             : 
     115      600570 :   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
     116      600570 :   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
     117             : 
     118      600570 :   if (Op0Idx == -1 && Op1Idx == -1)
     119             :     return true;
     120             : 
     121             : 
     122      595783 :   if ((Op0Idx == -1 && Op1Idx != -1) ||
     123             :       (Op1Idx == -1 && Op0Idx != -1))
     124             :     return false;
     125             : 
     126             :   // getNamedOperandIdx returns the index for the MachineInstr's operands,
     127             :   // which includes the result as the first operand. We are indexing into the
     128             :   // MachineSDNode's operands, so we need to skip the result operand to get
     129             :   // the real index.
     130      595757 :   --Op0Idx;
     131      595757 :   --Op1Idx;
     132             : 
     133     1787271 :   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
     134             : }
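
The off-by-one between the two operand spaces above is subtle enough that a
standalone sketch may help. This hypothetical helper (not part of this file)
assumes Node is a MachineSDNode for an instruction that produces a result and
OpName is one of the AMDGPU::OpName values used throughout this file:

    static SDValue getNamedSDNodeOperand(SDNode *Node, unsigned OpName) {
      int Idx = AMDGPU::getNamedOperandIdx(Node->getMachineOpcode(), OpName);
      assert(Idx > 0 && "expected a named operand on a result-producing instr");
      // getNamedOperandIdx indexes the MachineInstr operand list, which starts
      // with the result; SDNode operand lists do not, so shift down by one.
      return Node->getOperand(Idx - 1);
    }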
     135             : 
     136       20938 : bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
     137             :                                                     AliasAnalysis *AA) const {
     138             :   // TODO: The generic check fails for VALU instructions that should be
     139             :   // rematerializable due to implicit reads of exec. We really want all of the
     140             :   // generic logic for this except for this.
     141       20938 :   switch (MI.getOpcode()) {
     142             :   case AMDGPU::V_MOV_B32_e32:
     143             :   case AMDGPU::V_MOV_B32_e64:
     144             :   case AMDGPU::V_MOV_B64_PSEUDO:
     145             :     return true;
     146             :   default:
     147             :     return false;
     148             :   }
     149             : }
     150             : 
     151      415754 : bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     152             :                                           int64_t &Offset0,
     153             :                                           int64_t &Offset1) const {
     154      415754 :   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
     155             :     return false;
     156             : 
     157             :   unsigned Opc0 = Load0->getMachineOpcode();
     158             :   unsigned Opc1 = Load1->getMachineOpcode();
     159             : 
     160             :   // Make sure both are actually loads.
     161     1048011 :   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     162             :     return false;
     163             : 
     164      629476 :   if (isDS(Opc0) && isDS(Opc1)) {
     165             : 
     166             :     // FIXME: Handle this case:
     167        3623 :     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
     168             :       return false;
     169             : 
     170             :     // Check base reg.
     171        3607 :     if (Load0->getOperand(1) != Load1->getOperand(1))
     172             :       return false;
     173             : 
     174             :     // Check chain.
     175          16 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     176           0 :       return false;
     177             : 
     178             :     // Skip read2 / write2 variants for simplicity.
     179             :     // TODO: We should report true if the used offsets are adjacent (excluded
     180             :     // st64 versions).
     181          16 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
     182          16 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
     183             :       return false;
     184             : 
     185          32 :     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
     186          32 :     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
     187          16 :     return true;
     188             :   }
     189             : 
     190      311115 :   if (isSMRD(Opc0) && isSMRD(Opc1)) {
     191             :     // Skip time and cache invalidation instructions.
     192       25256 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
     193       25249 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
     194             :       return false;
     195             : 
     196             :     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
     197             : 
     198             :     // Check base reg.
     199       25242 :     if (Load0->getOperand(0) != Load1->getOperand(0))
     200             :       return false;
     201             : 
     202             :     const ConstantSDNode *Load0Offset =
     203             :         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
     204             :     const ConstantSDNode *Load1Offset =
     205             :         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
     206             : 
     207       18471 :     if (!Load0Offset || !Load1Offset)
     208             :       return false;
     209             : 
     210             :     // Check chain.
     211       18459 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     212           0 :       return false;
     213             : 
     214       18459 :     Offset0 = Load0Offset->getZExtValue();
     215       18459 :     Offset1 = Load1Offset->getZExtValue();
     216       18459 :     return true;
     217             :   }
     218             : 
     219             :   // MUBUF and MTBUF can access the same addresses.
     220      285859 :   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
     221             : 
     222             :     // MUBUF and MTBUF have vaddr at different indices.
     223      218278 :     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
     224      195592 :         findChainOperand(Load0) != findChainOperand(Load1) ||
     225      413870 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
     226      186700 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
     227       33012 :       return false;
     228             : 
     229      185266 :     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
     230      185266 :     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
     231             : 
     232      185266 :     if (OffIdx0 == -1 || OffIdx1 == -1)
     233             :       return false;
     234             : 
     235             :     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
      236             :     // include the output in the operand list, but SDNodes don't, we need to
      237             :     // subtract one from the index.
     238      185266 :     --OffIdx0;
     239      185266 :     --OffIdx1;
     240             : 
     241      185266 :     SDValue Off0 = Load0->getOperand(OffIdx0);
     242      370532 :     SDValue Off1 = Load1->getOperand(OffIdx1);
     243             : 
     244             :     // The offset might be a FrameIndexSDNode.
     245             :     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
     246             :       return false;
     247             : 
     248      185266 :     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
     249      185266 :     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
     250      185266 :     return true;
     251             :   }
     252             : 
     253             :   return false;
     254             : }
     255             : 
     256             : static bool isStride64(unsigned Opc) {
     257             :   switch (Opc) {
     258             :   case AMDGPU::DS_READ2ST64_B32:
     259             :   case AMDGPU::DS_READ2ST64_B64:
     260             :   case AMDGPU::DS_WRITE2ST64_B32:
     261             :   case AMDGPU::DS_WRITE2ST64_B64:
     262             :     return true;
     263             :   default:
     264             :     return false;
     265             :   }
     266             : }
     267             : 
     268     1050942 : bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     269             :                                         int64_t &Offset,
     270             :                                         const TargetRegisterInfo *TRI) const {
     271     1050942 :   unsigned Opc = LdSt.getOpcode();
     272             : 
     273     1050942 :   if (isDS(LdSt)) {
     274             :     const MachineOperand *OffsetImm =
     275      129747 :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     276      129747 :     if (OffsetImm) {
     277             :       // Normal, single offset LDS instruction.
     278             :       const MachineOperand *AddrReg =
     279       63825 :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     280             : 
     281       63825 :       BaseReg = AddrReg->getReg();
     282       63825 :       Offset = OffsetImm->getImm();
     283       63825 :       return true;
     284             :     }
     285             : 
      286             :     // The two-offset instructions use offset0 and offset1 instead. We can treat
      287             :     // these as a load with a single offset if the two offsets are consecutive. We
      288             :     // will use this for some partially aligned loads.
     289             :     const MachineOperand *Offset0Imm =
     290       65922 :         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
     291             :     const MachineOperand *Offset1Imm =
     292       65922 :         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
     293             : 
     294       65922 :     uint8_t Offset0 = Offset0Imm->getImm();
     295       65922 :     uint8_t Offset1 = Offset1Imm->getImm();
     296             : 
     297       65922 :     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      298             :       // Each of these offsets is in element-sized units, so we need to convert
      299             :       // them to byte offsets for the individual reads.
     300             : 
     301             :       unsigned EltSize;
     302       51854 :       if (LdSt.mayLoad())
     303       13340 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
     304             :       else {
     305             :         assert(LdSt.mayStore());
     306       38514 :         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
     307       38514 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
     308             :       }
     309             : 
     310             :       if (isStride64(Opc))
     311           5 :         EltSize *= 64;
     312             : 
     313             :       const MachineOperand *AddrReg =
     314       51854 :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     315       51854 :       BaseReg = AddrReg->getReg();
     316       51854 :       Offset = EltSize * Offset0;
     317       51854 :       return true;
     318             :     }
     319             : 
     320             :     return false;
     321             :   }
     322             : 
     323      921195 :   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
     324      811514 :     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
     325      811514 :     if (SOffset && SOffset->isReg())
     326             :       return false;
     327             : 
     328             :     const MachineOperand *AddrReg =
     329       95031 :         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     330       95031 :     if (!AddrReg)
     331             :       return false;
     332             : 
     333             :     const MachineOperand *OffsetImm =
     334        6176 :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     335        6176 :     BaseReg = AddrReg->getReg();
     336        6176 :     Offset = OffsetImm->getImm();
     337             : 
     338        6176 :     if (SOffset) // soffset can be an inline immediate.
     339        6176 :       Offset += SOffset->getImm();
     340             : 
     341        6176 :     return true;
     342             :   }
     343             : 
     344      109681 :   if (isSMRD(LdSt)) {
     345             :     const MachineOperand *OffsetImm =
     346       27118 :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     347       27118 :     if (!OffsetImm)
     348             :       return false;
     349             : 
     350             :     const MachineOperand *SBaseReg =
     351       27033 :         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
     352       27033 :     BaseReg = SBaseReg->getReg();
     353       27033 :     Offset = OffsetImm->getImm();
     354       27033 :     return true;
     355             :   }
     356             : 
     357       82563 :   if (isFLAT(LdSt)) {
     358       80558 :     const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     359       80558 :     if (VAddr) {
     360             :       // Can't analyze 2 offsets.
     361       80558 :       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
     362             :         return false;
     363             : 
     364       80558 :       BaseReg = VAddr->getReg();
     365             :     } else {
      366             :       // Scratch instructions have either vaddr or saddr.
     367           0 :       BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
     368             :     }
     369             : 
     370       80558 :     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
     371       80558 :     return true;
     372             :   }
     373             : 
     374             :   return false;
     375             : }
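
As a worked example of the read2/write2 path above (hypothetical numbers, not
taken from a real test): for a DS_READ2_B64 whose offset0/offset1 are 4 and 5,
the destination register class is 128 bits wide, so EltSize = 128 / 16 = 8
bytes and the reported base offset is EltSize * Offset0 = 32 bytes; the st64
variants would additionally scale EltSize by 64.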
     376             : 
     377       23662 : static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
     378             :                                   const MachineInstr &MI2, unsigned BaseReg2) {
     379       23662 :   if (BaseReg1 == BaseReg2)
     380             :     return true;
     381             : 
     382       20100 :   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
     383         263 :     return false;
     384             : 
     385        9788 :   auto MO1 = *MI1.memoperands_begin();
     386        9788 :   auto MO2 = *MI2.memoperands_begin();
     387        9788 :   if (MO1->getAddrSpace() != MO2->getAddrSpace())
     388             :     return false;
     389             : 
     390             :   auto Base1 = MO1->getValue();
     391             :   auto Base2 = MO2->getValue();
     392        3168 :   if (!Base1 || !Base2)
     393             :     return false;
     394        3112 :   const MachineFunction &MF = *MI1.getParent()->getParent();
     395        3112 :   const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
     396             :   Base1 = GetUnderlyingObject(Base1, DL);
      397             :   Base2 = GetUnderlyingObject(Base2, DL);
     398             : 
     399        3112 :   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
     400             :     return false;
     401             : 
     402        3100 :   return Base1 == Base2;
     403             : }
     404             : 
     405       23662 : bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
     406             :                                       unsigned BaseReg1,
     407             :                                       MachineInstr &SecondLdSt,
     408             :                                       unsigned BaseReg2,
     409             :                                       unsigned NumLoads) const {
     410       23662 :   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
     411             :     return false;
     412             : 
     413             :   const MachineOperand *FirstDst = nullptr;
     414             :   const MachineOperand *SecondDst = nullptr;
     415             : 
     416       16711 :   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
     417       33151 :       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
     418        2122 :       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
     419             :     const unsigned MaxGlobalLoadCluster = 6;
     420        2398 :     if (NumLoads > MaxGlobalLoadCluster)
     421             :       return false;
     422             : 
     423        2398 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     424        2398 :     if (!FirstDst)
     425         826 :       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     426        2398 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
     427        2398 :     if (!SecondDst)
     428         826 :       SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     429       14313 :   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
     430       10964 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
     431       10964 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
     432        3349 :   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
     433        3204 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     434        3204 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     435             :   }
     436             : 
     437       16711 :   if (!FirstDst || !SecondDst)
     438             :     return false;
     439             : 
     440             :   // Try to limit clustering based on the total number of bytes loaded
     441             :   // rather than the number of instructions.  This is done to help reduce
     442             :   // register pressure.  The method used is somewhat inexact, though,
     443             :   // because it assumes that all loads in the cluster will load the
     444             :   // same number of bytes as FirstLdSt.
     445             : 
     446             :   // The unit of this value is bytes.
     447             :   // FIXME: This needs finer tuning.
     448             :   unsigned LoadClusterThreshold = 16;
     449             : 
     450             :   const MachineRegisterInfo &MRI =
     451       15167 :       FirstLdSt.getParent()->getParent()->getRegInfo();
     452       15167 :   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
     453             : 
     454       15167 :   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
     455             : }
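
To make the clustering threshold concrete (illustrative arithmetic only): with
LoadClusterThreshold = 16 bytes, four loads whose destination class is a 32-bit
VGPR (4 * 4 = 16 bytes) still cluster, while two loads into 128-bit registers
(2 * 16 = 32 bytes) do not.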
     456             : 
      457             : // FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
      458             : // the first 16 loads will be interleaved with the stores, and the next 16 will
      459             : // be clustered as expected. It should really split into two batches of 16 stores.
      460             : //
      461             : // Loads are clustered until this returns false, rather than trying to schedule
      462             : // groups of stores. This also means this hook has to decide whether loads from
      463             : // different address spaces should be clustered, and whether loads that might
      464             : // cause bank conflicts should be.
      465             : //
      466             : // This hook might be deprecated, so it may not be worth much effort to fix.
     467       31587 : bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
     468             :                                           int64_t Offset0, int64_t Offset1,
     469             :                                           unsigned NumLoads) const {
     470             :   assert(Offset1 > Offset0 &&
     471             :          "Second offset should be larger than first offset!");
      472             :   // If we have at most 16 loads in a row and the offsets are within 64
     473             :   // bytes, then schedule together.
     474             : 
     475             :   // A cacheline is 64 bytes (for global memory).
     476       31587 :   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
     477             : }
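
For example (illustrative offsets): two loads at offsets 0 and 48 fall within
the 64-byte window and are scheduled near each other, whereas offsets 0 and 80
are not, regardless of how small the group is.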
     478             : 
     479          10 : static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
     480             :                               MachineBasicBlock::iterator MI,
     481             :                               const DebugLoc &DL, unsigned DestReg,
     482             :                               unsigned SrcReg, bool KillSrc) {
     483          10 :   MachineFunction *MF = MBB.getParent();
     484             :   DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
     485             :                                         "illegal SGPR to VGPR copy",
     486          10 :                                         DL, DS_Error);
     487          10 :   LLVMContext &C = MF->getFunction().getContext();
     488          10 :   C.diagnose(IllegalCopy);
     489             : 
     490          20 :   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
     491          10 :     .addReg(SrcReg, getKillRegState(KillSrc));
     492          10 : }
     493             : 
     494       59284 : void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     495             :                               MachineBasicBlock::iterator MI,
     496             :                               const DebugLoc &DL, unsigned DestReg,
     497             :                               unsigned SrcReg, bool KillSrc) const {
     498       59284 :   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
     499             : 
     500       59284 :   if (RC == &AMDGPU::VGPR_32RegClass) {
     501             :     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
     502             :            AMDGPU::SReg_32RegClass.contains(SrcReg));
     503       64470 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     504       32235 :       .addReg(SrcReg, getKillRegState(KillSrc));
     505       32235 :     return;
     506             :   }
     507             : 
     508       27049 :   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
     509             :       RC == &AMDGPU::SReg_32RegClass) {
     510       19591 :     if (SrcReg == AMDGPU::SCC) {
     511           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
     512             :           .addImm(-1)
     513             :           .addImm(0);
     514           0 :       return;
     515             :     }
     516             : 
     517       19591 :     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
     518           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     519           2 :       return;
     520             :     }
     521             : 
     522       39178 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     523       19589 :             .addReg(SrcReg, getKillRegState(KillSrc));
     524       19589 :     return;
     525             :   }
     526             : 
     527        7458 :   if (RC == &AMDGPU::SReg_64RegClass) {
     528        2236 :     if (DestReg == AMDGPU::VCC) {
     529          25 :       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     530          72 :         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
     531          24 :           .addReg(SrcReg, getKillRegState(KillSrc));
     532             :       } else {
     533             :         // FIXME: Hack until VReg_1 removed.
     534             :         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
     535           3 :         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
     536             :           .addImm(0)
     537           1 :           .addReg(SrcReg, getKillRegState(KillSrc));
     538             :       }
     539             : 
     540          25 :       return;
     541             :     }
     542             : 
     543        2211 :     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     544           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     545           2 :       return;
     546             :     }
     547             : 
     548        4418 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     549        2209 :             .addReg(SrcReg, getKillRegState(KillSrc));
     550        2209 :     return;
     551             :   }
     552             : 
     553        5222 :   if (DestReg == AMDGPU::SCC) {
     554             :     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
     555           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
     556           0 :       .addReg(SrcReg, getKillRegState(KillSrc))
     557             :       .addImm(0);
     558           0 :     return;
     559             :   }
     560             : 
     561             :   unsigned EltSize = 4;
     562             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     563        5222 :   if (RI.isSGPRClass(RC)) {
     564         198 :     if (RI.getRegSizeInBits(*RC) > 32) {
     565             :       Opcode =  AMDGPU::S_MOV_B64;
     566             :       EltSize = 8;
     567             :     } else {
     568             :       Opcode = AMDGPU::S_MOV_B32;
     569             :       EltSize = 4;
     570             :     }
     571             : 
     572         396 :     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
     573           6 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     574           6 :       return;
     575             :     }
     576             :   }
     577             : 
     578        5216 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
     579             :   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
     580             : 
     581       16018 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     582             :     unsigned SubIdx;
     583       10802 :     if (Forward)
     584        6988 :       SubIdx = SubIndices[Idx];
     585             :     else
     586        7628 :       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
     587             : 
     588             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     589       10802 :       get(Opcode), RI.getSubReg(DestReg, SubIdx));
     590             : 
     591       10802 :     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
     592             : 
     593       10802 :     if (Idx == 0)
     594        5216 :       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
     595             : 
     596       10802 :     bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
     597       10802 :     Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
     598             :   }
     599             : }
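
For illustration (a hypothetical expansion, not verbatim compiler output):
copying one 128-bit VGPR tuple to another falls through to the loop above with
Opcode = V_MOV_B32_e32 and EltSize = 4, producing four moves over sub0..sub3,
in forward or reverse sub-register order depending on how the registers
overlap. The first move also implicitly defines the whole destination tuple,
and every move carries an implicit use of the whole source tuple, with the kill
flag (if requested) only on the last one.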
     600             : 
     601      320216 : int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
     602             :   int NewOpc;
     603             : 
     604             :   // Try to map original to commuted opcode
     605      320216 :   NewOpc = AMDGPU::getCommuteRev(Opcode);
     606      320216 :   if (NewOpc != -1)
     607             :     // Check if the commuted (REV) opcode exists on the target.
     608       19589 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     609             : 
     610             :   // Try to map commuted to original opcode
     611      300627 :   NewOpc = AMDGPU::getCommuteOrig(Opcode);
     612      300627 :   if (NewOpc != -1)
     613             :     // Check if the original (non-REV) opcode exists on the target.
     614       45164 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     615             : 
     616      255463 :   return Opcode;
     617             : }
     618             : 
     619           0 : void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
     620             :                                        MachineBasicBlock::iterator MI,
     621             :                                        const DebugLoc &DL, unsigned DestReg,
     622             :                                        int64_t Value) const {
     623           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     624             :   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
     625           0 :   if (RegClass == &AMDGPU::SReg_32RegClass ||
     626           0 :       RegClass == &AMDGPU::SGPR_32RegClass ||
     627           0 :       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
     628             :       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
     629           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     630             :       .addImm(Value);
     631           0 :     return;
     632             :   }
     633             : 
     634           0 :   if (RegClass == &AMDGPU::SReg_64RegClass ||
     635           0 :       RegClass == &AMDGPU::SGPR_64RegClass ||
     636             :       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
     637           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     638             :       .addImm(Value);
     639           0 :     return;
     640             :   }
     641             : 
     642           0 :   if (RegClass == &AMDGPU::VGPR_32RegClass) {
     643           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     644             :       .addImm(Value);
     645           0 :     return;
     646             :   }
     647           0 :   if (RegClass == &AMDGPU::VReg_64RegClass) {
     648           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
     649             :       .addImm(Value);
     650           0 :     return;
     651             :   }
     652             : 
     653             :   unsigned EltSize = 4;
     654             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     655           0 :   if (RI.isSGPRClass(RegClass)) {
     656           0 :     if (RI.getRegSizeInBits(*RegClass) > 32) {
     657             :       Opcode =  AMDGPU::S_MOV_B64;
     658             :       EltSize = 8;
     659             :     } else {
     660             :       Opcode = AMDGPU::S_MOV_B32;
     661             :       EltSize = 4;
     662             :     }
     663             :   }
     664             : 
     665           0 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
     666           0 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     667           0 :     int64_t IdxValue = Idx == 0 ? Value : 0;
     668             : 
     669             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     670           0 :       get(Opcode), RI.getSubReg(DestReg, Idx));
     671             :     Builder.addImm(IdxValue);
     672             :   }
     673             : }
     674             : 
     675             : const TargetRegisterClass *
     676           0 : SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
     677           0 :   return &AMDGPU::VGPR_32RegClass;
     678             : }
     679             : 
     680           0 : void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     681             :                                      MachineBasicBlock::iterator I,
     682             :                                      const DebugLoc &DL, unsigned DstReg,
     683             :                                      ArrayRef<MachineOperand> Cond,
     684             :                                      unsigned TrueReg,
     685             :                                      unsigned FalseReg) const {
     686           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     687             :   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
     688             :          "Not a VGPR32 reg");
     689             : 
     690           0 :   if (Cond.size() == 1) {
     691           0 :     unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     692           0 :     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     693             :       .add(Cond[0]);
     694           0 :     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     695           0 :       .addReg(FalseReg)
     696           0 :       .addReg(TrueReg)
     697           0 :       .addReg(SReg);
     698           0 :   } else if (Cond.size() == 2) {
     699             :     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
     700           0 :     switch (Cond[0].getImm()) {
     701             :     case SIInstrInfo::SCC_TRUE: {
     702           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     703           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     704             :         .addImm(-1)
     705             :         .addImm(0);
     706           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     707           0 :         .addReg(FalseReg)
     708           0 :         .addReg(TrueReg)
     709           0 :         .addReg(SReg);
     710           0 :       break;
     711             :     }
     712             :     case SIInstrInfo::SCC_FALSE: {
     713           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     714           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     715             :         .addImm(0)
     716             :         .addImm(-1);
     717           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     718           0 :         .addReg(FalseReg)
     719           0 :         .addReg(TrueReg)
     720           0 :         .addReg(SReg);
     721           0 :       break;
     722             :     }
     723           0 :     case SIInstrInfo::VCCNZ: {
     724           0 :       MachineOperand RegOp = Cond[1];
     725             :       RegOp.setImplicit(false);
     726           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     727           0 :       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     728             :         .add(RegOp);
     729           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     730           0 :           .addReg(FalseReg)
     731           0 :           .addReg(TrueReg)
     732           0 :           .addReg(SReg);
     733             :       break;
     734             :     }
     735           0 :     case SIInstrInfo::VCCZ: {
     736           0 :       MachineOperand RegOp = Cond[1];
     737             :       RegOp.setImplicit(false);
     738           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     739           0 :       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
     740             :         .add(RegOp);
     741           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     742           0 :           .addReg(TrueReg)
     743           0 :           .addReg(FalseReg)
     744           0 :           .addReg(SReg);
     745             :       break;
     746             :     }
     747             :     case SIInstrInfo::EXECNZ: {
     748           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     749           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     750           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     751             :         .addImm(0);
     752           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     753             :         .addImm(-1)
     754             :         .addImm(0);
     755           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     756           0 :         .addReg(FalseReg)
     757           0 :         .addReg(TrueReg)
     758           0 :         .addReg(SReg);
     759           0 :       break;
     760             :     }
     761             :     case SIInstrInfo::EXECZ: {
     762           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
     763           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     764           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     765             :         .addImm(0);
     766           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     767             :         .addImm(0)
     768             :         .addImm(-1);
     769           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     770           0 :         .addReg(FalseReg)
     771           0 :         .addReg(TrueReg)
     772           0 :         .addReg(SReg);
     773           0 :       llvm_unreachable("Unhandled branch predicate EXECZ");
     774             :       break;
     775             :     }
     776           0 :     default:
     777           0 :       llvm_unreachable("invalid branch predicate");
     778             :     }
     779             :   } else {
     780           0 :     llvm_unreachable("Can only handle Cond size 1 or 2");
     781             :   }
     782           0 : }
     783             : 
     784           0 : unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
     785             :                                MachineBasicBlock::iterator I,
     786             :                                const DebugLoc &DL,
     787             :                                unsigned SrcReg, int Value) const {
     788           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     789           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     790           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
     791           0 :     .addImm(Value)
     792           0 :     .addReg(SrcReg);
     793             : 
     794           0 :   return Reg;
     795             : }
     796             : 
     797           0 : unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
     798             :                                MachineBasicBlock::iterator I,
     799             :                                const DebugLoc &DL,
     800             :                                unsigned SrcReg, int Value) const {
     801           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     802           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     803           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
     804           0 :     .addImm(Value)
     805           0 :     .addReg(SrcReg);
     806             : 
     807           0 :   return Reg;
     808             : }
     809             : 
     810       11269 : unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
     811             : 
     812       11269 :   if (RI.getRegSizeInBits(*DstRC) == 32) {
     813       10835 :     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     814         434 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
     815             :     return AMDGPU::S_MOV_B64;
     816         428 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
     817         428 :     return  AMDGPU::V_MOV_B64_PSEUDO;
     818             :   }
     819             :   return AMDGPU::COPY;
     820             : }
     821             : 
     822             : static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     823         702 :   switch (Size) {
     824             :   case 4:
     825             :     return AMDGPU::SI_SPILL_S32_SAVE;
     826         104 :   case 8:
     827             :     return AMDGPU::SI_SPILL_S64_SAVE;
     828          60 :   case 16:
     829             :     return AMDGPU::SI_SPILL_S128_SAVE;
     830          33 :   case 32:
     831             :     return AMDGPU::SI_SPILL_S256_SAVE;
     832           8 :   case 64:
     833             :     return AMDGPU::SI_SPILL_S512_SAVE;
     834           0 :   default:
     835           0 :     llvm_unreachable("unknown register size");
     836             :   }
     837             : }
     838             : 
     839             : static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     840        1240 :   switch (Size) {
     841             :   case 4:
     842             :     return AMDGPU::SI_SPILL_V32_SAVE;
     843           9 :   case 8:
     844             :     return AMDGPU::SI_SPILL_V64_SAVE;
     845           0 :   case 12:
     846             :     return AMDGPU::SI_SPILL_V96_SAVE;
     847         669 :   case 16:
     848             :     return AMDGPU::SI_SPILL_V128_SAVE;
     849           0 :   case 32:
     850             :     return AMDGPU::SI_SPILL_V256_SAVE;
     851           0 :   case 64:
     852             :     return AMDGPU::SI_SPILL_V512_SAVE;
     853           0 :   default:
     854           0 :     llvm_unreachable("unknown register size");
     855             :   }
     856             : }
     857             : 
     858        1942 : void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     859             :                                       MachineBasicBlock::iterator MI,
     860             :                                       unsigned SrcReg, bool isKill,
     861             :                                       int FrameIndex,
     862             :                                       const TargetRegisterClass *RC,
     863             :                                       const TargetRegisterInfo *TRI) const {
     864        1942 :   MachineFunction *MF = MBB.getParent();
     865        1942 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     866        1942 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     867             :   DebugLoc DL = MBB.findDebugLoc(MI);
     868             : 
     869             :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     870             :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     871             :   MachinePointerInfo PtrInfo
     872        1942 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     873             :   MachineMemOperand *MMO
     874        1942 :     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
     875             :                                Size, Align);
     876             :   unsigned SpillSize = TRI->getSpillSize(*RC);
     877             : 
     878        1942 :   if (RI.isSGPRClass(RC)) {
     879             :     MFI->setHasSpilledSGPRs();
     880             : 
     881             :     // We are only allowed to create one new instruction when spilling
     882             :     // registers, so we need to use pseudo instruction for spilling SGPRs.
     883         702 :     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
     884             : 
      885             :     // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
     886             :     // to make sure we are using the correct register class.
     887         702 :     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
     888          23 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     889          23 :       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     890             :     }
     891             : 
     892         702 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
     893         702 :       .addReg(SrcReg, getKillRegState(isKill)) // data
     894             :       .addFrameIndex(FrameIndex)               // addr
     895             :       .addMemOperand(MMO)
     896         702 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
     897         702 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
     898             :     // Add the scratch resource registers as implicit uses because we may end up
     899             :     // needing them, and need to ensure that the reserved registers are
     900             :     // correctly handled.
     901             : 
     902             :     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
     903         702 :     if (ST.hasScalarStores()) {
     904             :       // m0 is used for offset to scalar stores if used to spill.
     905         370 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
     906             :     }
     907             : 
     908             :     return;
     909             :   }
     910             : 
     911        1240 :   if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
     912           0 :     LLVMContext &Ctx = MF->getFunction().getContext();
     913           0 :     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
     914             :                   " spill register");
     915           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
     916           0 :       .addReg(SrcReg);
     917             : 
     918           0 :     return;
     919             :   }
     920             : 
     921             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
     922             : 
     923             :   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
     924             :   MFI->setHasSpilledVGPRs();
     925        3720 :   BuildMI(MBB, MI, DL, get(Opcode))
     926        1240 :     .addReg(SrcReg, getKillRegState(isKill)) // data
     927             :     .addFrameIndex(FrameIndex)               // addr
     928        1240 :     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     929        1240 :     .addReg(MFI->getFrameOffsetReg())        // scratch_offset
     930             :     .addImm(0)                               // offset
     931             :     .addMemOperand(MMO);
     932             : }
     933             : 
     934             : static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     935         693 :   switch (Size) {
     936             :   case 4:
     937             :     return AMDGPU::SI_SPILL_S32_RESTORE;
     938         101 :   case 8:
     939             :     return AMDGPU::SI_SPILL_S64_RESTORE;
     940          59 :   case 16:
     941             :     return AMDGPU::SI_SPILL_S128_RESTORE;
     942          33 :   case 32:
     943             :     return AMDGPU::SI_SPILL_S256_RESTORE;
     944           8 :   case 64:
     945             :     return AMDGPU::SI_SPILL_S512_RESTORE;
     946           0 :   default:
     947           0 :     llvm_unreachable("unknown register size");
     948             :   }
     949             : }
     950             : 
     951             : static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     952        1152 :   switch (Size) {
     953             :   case 4:
     954             :     return AMDGPU::SI_SPILL_V32_RESTORE;
     955           9 :   case 8:
     956             :     return AMDGPU::SI_SPILL_V64_RESTORE;
     957           0 :   case 12:
     958             :     return AMDGPU::SI_SPILL_V96_RESTORE;
     959         672 :   case 16:
     960             :     return AMDGPU::SI_SPILL_V128_RESTORE;
     961           0 :   case 32:
     962             :     return AMDGPU::SI_SPILL_V256_RESTORE;
     963           0 :   case 64:
     964             :     return AMDGPU::SI_SPILL_V512_RESTORE;
     965           0 :   default:
     966           0 :     llvm_unreachable("unknown register size");
     967             :   }
     968             : }
     969             : 
     970        1845 : void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     971             :                                        MachineBasicBlock::iterator MI,
     972             :                                        unsigned DestReg, int FrameIndex,
     973             :                                        const TargetRegisterClass *RC,
     974             :                                        const TargetRegisterInfo *TRI) const {
     975        1845 :   MachineFunction *MF = MBB.getParent();
     976        1845 :   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     977        1845 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     978             :   DebugLoc DL = MBB.findDebugLoc(MI);
     979             :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     980             :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     981             :   unsigned SpillSize = TRI->getSpillSize(*RC);
     982             : 
     983             :   MachinePointerInfo PtrInfo
     984        1845 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     985             : 
     986        1845 :   MachineMemOperand *MMO = MF->getMachineMemOperand(
     987             :     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
     988             : 
     989        1845 :   if (RI.isSGPRClass(RC)) {
     990             :     // FIXME: Maybe this should not include a memoperand because it will be
     991             :     // lowered to non-memory instructions.
     992         693 :     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
     993         693 :     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
     994          23 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     995          23 :       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     996             :     }
     997             : 
     998             :     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
     999         693 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
    1000             :       .addFrameIndex(FrameIndex) // addr
    1001             :       .addMemOperand(MMO)
    1002         693 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
    1003         693 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
    1004             : 
    1005         693 :     if (ST.hasScalarStores()) {
     1006             :       // m0 is used as the offset for scalar stores when they are used for spilling.
    1007         368 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    1008             :     }
    1009             : 
    1010             :     return;
    1011             :   }
    1012             : 
    1013        1152 :   if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
    1014           0 :     LLVMContext &Ctx = MF->getFunction().getContext();
    1015           0 :     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
    1016             :                   " restore register");
    1017           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
    1018             : 
    1019           0 :     return;
    1020             :   }
    1021             : 
    1022             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
    1023             : 
    1024             :   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
    1025        3456 :   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    1026             :     .addFrameIndex(FrameIndex)        // vaddr
    1027        1152 :     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
    1028        1152 :     .addReg(MFI->getFrameOffsetReg()) // scratch_offset
    1029             :     .addImm(0)                        // offset
    1030             :     .addMemOperand(MMO);
    1031             : }
    1032             : 
     1033             : /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
    1034           0 : unsigned SIInstrInfo::calculateLDSSpillAddress(
    1035             :     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    1036             :     unsigned FrameOffset, unsigned Size) const {
    1037           0 :   MachineFunction *MF = MBB.getParent();
    1038           0 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    1039           0 :   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    1040             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1041           0 :   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
    1042           0 :   unsigned WavefrontSize = ST.getWavefrontSize();
    1043             : 
    1044           0 :   unsigned TIDReg = MFI->getTIDReg();
    1045           0 :   if (!MFI->hasCalculatedTID()) {
    1046           0 :     MachineBasicBlock &Entry = MBB.getParent()->front();
    1047             :     MachineBasicBlock::iterator Insert = Entry.front();
    1048             :     DebugLoc DL = Insert->getDebugLoc();
    1049             : 
    1050           0 :     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
    1051             :                                    *MF);
    1052           0 :     if (TIDReg == AMDGPU::NoRegister)
    1053             :       return TIDReg;
    1054             : 
    1055           0 :     if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
    1056             :         WorkGroupSize > WavefrontSize) {
    1057             :       unsigned TIDIGXReg
    1058             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
    1059             :       unsigned TIDIGYReg
    1060             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
    1061             :       unsigned TIDIGZReg
    1062             :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
    1063             :       unsigned InputPtrReg =
    1064             :           MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    1065           0 :       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
    1066           0 :         if (!Entry.isLiveIn(Reg))
    1067             :           Entry.addLiveIn(Reg);
    1068             :       }
    1069             : 
    1070           0 :       RS->enterBasicBlock(Entry);
    1071             :       // FIXME: Can we scavenge an SReg_64 and access the subregs?
    1072             :       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    1073             :       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    1074           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
    1075           0 :               .addReg(InputPtrReg)
    1076             :               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
    1077           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
    1078           0 :               .addReg(InputPtrReg)
    1079             :               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
    1080             : 
    1081             :       // NGROUPS.X * NGROUPS.Y
    1082           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
    1083           0 :               .addReg(STmp1)
    1084           0 :               .addReg(STmp0);
    1085             :       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
    1086           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
    1087           0 :               .addReg(STmp1)
    1088           0 :               .addReg(TIDIGXReg);
     1089             :       // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
    1090           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
    1091           0 :               .addReg(STmp0)
    1092           0 :               .addReg(TIDIGYReg)
    1093           0 :               .addReg(TIDReg);
     1094             :       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
    1095           0 :       getAddNoCarry(Entry, Insert, DL, TIDReg)
    1096           0 :         .addReg(TIDReg)
    1097           0 :         .addReg(TIDIGZReg);
    1098             :     } else {
    1099             :       // Get the wave id
    1100             :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
    1101           0 :               TIDReg)
    1102             :               .addImm(-1)
    1103             :               .addImm(0);
    1104             : 
    1105           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
    1106           0 :               TIDReg)
    1107             :               .addImm(-1)
    1108           0 :               .addReg(TIDReg);
    1109             :     }
    1110             : 
    1111           0 :     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
    1112           0 :             TIDReg)
    1113             :             .addImm(2)
    1114           0 :             .addReg(TIDReg);
    1115             :     MFI->setTIDReg(TIDReg);
    1116             :   }
    1117             : 
    1118             :   // Add FrameIndex to LDS offset
    1119           0 :   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
    1120           0 :   getAddNoCarry(MBB, MI, DL, TmpReg)
    1121           0 :     .addImm(LDSOffset)
    1122           0 :     .addReg(TIDReg);
    1123             : 
    1124           0 :   return TmpReg;
    1125             : }
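                      : // [Editorial note, not part of the original source] A minimal
                      : // summary of the address the code above materializes in TmpReg,
                      : // assuming TIDReg holds the linearized thread id computed in the
                      : // entry block:
                      : //
                      : //   SpillAddr = LDSSize + FrameOffset * WorkGroupSize + TID * 4
                      : //
                      : // The V_LSHLREV_B32 by 2 scales the thread id to a byte offset,
                      : // and getAddNoCarry() adds the per-frame LDS offset to it.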
    1126             : 
    1127        1774 : void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
    1128             :                                    MachineBasicBlock::iterator MI,
    1129             :                                    int Count) const {
    1130             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1131        3548 :   while (Count > 0) {
    1132             :     int Arg;
    1133        1774 :     if (Count >= 8)
    1134             :       Arg = 7;
    1135             :     else
    1136        1774 :       Arg = Count - 1;
    1137        1774 :     Count -= 8;
    1138        5322 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
    1139        1774 :             .addImm(Arg);
    1140             :   }
    1141        1774 : }
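                      : // [Editorial note, not part of the original source] Each S_NOP
                      : // emitted above waits (imm + 1) states with imm in [0, 7], so
                      : // Count is split greedily into chunks of at most 8; e.g.
                      : // Count = 10 produces "s_nop 7" (8 states) followed by
                      : // "s_nop 1" (2 states). getNumWaitStates() below reads the same
                      : // imm + 1 encoding back.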
    1142             : 
    1143        1774 : void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
    1144             :                              MachineBasicBlock::iterator MI) const {
    1145        1774 :   insertWaitStates(MBB, MI, 1);
    1146        1774 : }
    1147             : 
    1148           0 : void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
    1149           0 :   auto MF = MBB.getParent();
    1150           0 :   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1151             : 
    1152             :   assert(Info->isEntryFunction());
    1153             : 
    1154           0 :   if (MBB.succ_empty()) {
    1155           0 :     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    1156           0 :     if (HasNoTerminator)
    1157           0 :       BuildMI(MBB, MBB.end(), DebugLoc(),
    1158           0 :               get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
    1159             :   }
    1160           0 : }
    1161             : 
    1162      573761 : unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
    1163     1147522 :   switch (MI.getOpcode()) {
    1164             :   default: return 1; // FIXME: Do wait states equal cycles?
    1165             : 
    1166         891 :   case AMDGPU::S_NOP:
    1167         891 :     return MI.getOperand(0).getImm() + 1;
    1168             :   }
    1169             : }
    1170             : 
    1171      288198 : bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
    1172      288198 :   MachineBasicBlock &MBB = *MI.getParent();
    1173             :   DebugLoc DL = MBB.findDebugLoc(MI);
    1174      576396 :   switch (MI.getOpcode()) {
    1175             :   default: return TargetInstrInfo::expandPostRAPseudo(MI);
    1176           1 :   case AMDGPU::S_MOV_B64_term:
    1177             :     // This is only a terminator to get the correct spill code placement during
    1178             :     // register allocation.
    1179           1 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1180             :     break;
    1181             : 
    1182           0 :   case AMDGPU::S_XOR_B64_term:
    1183             :     // This is only a terminator to get the correct spill code placement during
    1184             :     // register allocation.
    1185           0 :     MI.setDesc(get(AMDGPU::S_XOR_B64));
    1186             :     break;
    1187             : 
    1188           0 :   case AMDGPU::S_ANDN2_B64_term:
    1189             :     // This is only a terminator to get the correct spill code placement during
    1190             :     // register allocation.
    1191           0 :     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    1192             :     break;
    1193             : 
    1194         336 :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1195         336 :     unsigned Dst = MI.getOperand(0).getReg();
    1196         336 :     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    1197         336 :     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    1198             : 
    1199         336 :     const MachineOperand &SrcOp = MI.getOperand(1);
    1200             :     // FIXME: Will this work for 64-bit floating point immediates?
    1201             :     assert(!SrcOp.isFPImm());
    1202         336 :     if (SrcOp.isImm()) {
    1203         336 :       APInt Imm(64, SrcOp.getImm());
    1204         672 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1205         672 :         .addImm(Imm.getLoBits(32).getZExtValue())
    1206         336 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1207         672 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1208        1008 :         .addImm(Imm.getHiBits(32).getZExtValue())
    1209         336 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1210             :     } else {
    1211             :       assert(SrcOp.isReg());
    1212           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1213           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
    1214           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1215           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1216           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
    1217           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1218             :     }
    1219         336 :     MI.eraseFromParent();
    1220         336 :     break;
    1221             :   }
    1222          28 :   case AMDGPU::V_SET_INACTIVE_B32: {
    1223          56 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1224          28 :       .addReg(AMDGPU::EXEC);
    1225          56 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
    1226          28 :       .add(MI.getOperand(2));
    1227          56 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1228          28 :       .addReg(AMDGPU::EXEC);
    1229          28 :     MI.eraseFromParent();
    1230          28 :     break;
    1231             :   }
    1232           2 :   case AMDGPU::V_SET_INACTIVE_B64: {
    1233           4 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1234           2 :       .addReg(AMDGPU::EXEC);
    1235           2 :     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
    1236           4 :                                  MI.getOperand(0).getReg())
    1237           2 :       .add(MI.getOperand(2));
    1238           2 :     expandPostRAPseudo(*Copy);
    1239           4 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1240           2 :       .addReg(AMDGPU::EXEC);
    1241           2 :     MI.eraseFromParent();
    1242           2 :     break;
    1243             :   }
    1244          66 :   case AMDGPU::V_MOVRELD_B32_V1:
    1245             :   case AMDGPU::V_MOVRELD_B32_V2:
    1246             :   case AMDGPU::V_MOVRELD_B32_V4:
    1247             :   case AMDGPU::V_MOVRELD_B32_V8:
    1248             :   case AMDGPU::V_MOVRELD_B32_V16: {
    1249          66 :     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    1250          66 :     unsigned VecReg = MI.getOperand(0).getReg();
    1251             :     bool IsUndef = MI.getOperand(1).isUndef();
    1252          66 :     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    1253             :     assert(VecReg == MI.getOperand(1).getReg());
    1254             : 
    1255             :     MachineInstr *MovRel =
    1256          66 :         BuildMI(MBB, MI, DL, MovRelDesc)
    1257          66 :             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
    1258          66 :             .add(MI.getOperand(2))
    1259          66 :             .addReg(VecReg, RegState::ImplicitDefine)
    1260             :             .addReg(VecReg,
    1261         130 :                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
    1262             : 
    1263             :     const int ImpDefIdx =
    1264         132 :         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    1265          66 :     const int ImpUseIdx = ImpDefIdx + 1;
    1266          66 :     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
    1267             : 
    1268          66 :     MI.eraseFromParent();
    1269          66 :     break;
    1270             :   }
    1271         611 :   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    1272         611 :     MachineFunction &MF = *MBB.getParent();
    1273         611 :     unsigned Reg = MI.getOperand(0).getReg();
    1274         611 :     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    1275         611 :     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
    1276             : 
    1277             :     // Create a bundle so these instructions won't be re-ordered by the
    1278             :     // post-RA scheduler.
    1279             :     MIBundleBuilder Bundler(MBB, MI);
    1280        1222 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
    1281             : 
    1282             :     // Add 32-bit offset from this instruction to the start of the
    1283             :     // constant data.
    1284        1222 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
    1285         611 :                        .addReg(RegLo)
    1286         611 :                        .add(MI.getOperand(1)));
    1287             : 
    1288        1222 :     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
    1289         611 :                                   .addReg(RegHi);
    1290         611 :     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
    1291             :       MIB.addImm(0);
    1292             :     else
    1293             :       MIB.add(MI.getOperand(2));
    1294             : 
    1295             :     Bundler.append(MIB);
    1296         611 :     finalizeBundle(MBB, Bundler.begin());
    1297             : 
    1298         611 :     MI.eraseFromParent();
    1299             :     break;
    1300             :   }
    1301          44 :   case AMDGPU::EXIT_WWM: {
    1302             :     // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
    1303             :     // is exited.
    1304          44 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1305             :     break;
    1306             :   }
    1307          37 :   case TargetOpcode::BUNDLE: {
    1308          37 :     if (!MI.mayLoad())
    1309             :       return false;
    1310             : 
     1311             :     // If it is a load, it must be a memory clause.
    1312          37 :     for (MachineBasicBlock::instr_iterator I = MI.getIterator();
    1313         145 :          I->isBundledWithSucc(); ++I) {
    1314         108 :       I->unbundleFromSucc();
    1315         647 :       for (MachineOperand &MO : I->operands())
    1316         539 :         if (MO.isReg())
    1317             :           MO.setIsInternalRead(false);
    1318             :     }
    1319             : 
    1320          37 :     MI.eraseFromParent();
    1321          37 :     break;
    1322             :   }
    1323             :   }
    1324             :   return true;
    1325             : }
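                      : // [Editorial sketch, not part of the original source] The
                      : // SI_PC_ADD_REL_OFFSET case above expands into a bundle of the
                      : // following shape (register names are illustrative only):
                      : //
                      : //   s_getpc_b64 s[4:5]
                      : //   s_add_u32   s4, s4, <low part of operand 1>
                      : //   s_addc_u32  s5, s5, <high part of operand 2, or 0>
                      : //
                      : // Bundling keeps the post-RA scheduler from separating the PC
                      : // read from the adds that are relative to it.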
    1326             : 
    1327      263258 : bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
    1328             :                                       MachineOperand &Src0,
    1329             :                                       unsigned Src0OpName,
    1330             :                                       MachineOperand &Src1,
    1331             :                                       unsigned Src1OpName) const {
    1332      263258 :   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
    1333      263258 :   if (!Src0Mods)
    1334             :     return false;
    1335             : 
    1336       53463 :   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
    1337             :   assert(Src1Mods &&
    1338             :          "All commutable instructions have both src0 and src1 modifiers");
    1339             : 
    1340       53463 :   int Src0ModsVal = Src0Mods->getImm();
    1341       53463 :   int Src1ModsVal = Src1Mods->getImm();
    1342             : 
    1343       53463 :   Src1Mods->setImm(Src0ModsVal);
    1344       53463 :   Src0Mods->setImm(Src1ModsVal);
    1345       53463 :   return true;
    1346             : }
    1347             : 
    1348       42577 : static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
    1349             :                                              MachineOperand &RegOp,
    1350             :                                              MachineOperand &NonRegOp) {
    1351       42577 :   unsigned Reg = RegOp.getReg();
    1352             :   unsigned SubReg = RegOp.getSubReg();
    1353             :   bool IsKill = RegOp.isKill();
    1354             :   bool IsDead = RegOp.isDead();
    1355             :   bool IsUndef = RegOp.isUndef();
    1356             :   bool IsDebug = RegOp.isDebug();
    1357             : 
    1358       42577 :   if (NonRegOp.isImm())
    1359       42577 :     RegOp.ChangeToImmediate(NonRegOp.getImm());
    1360           0 :   else if (NonRegOp.isFI())
    1361           0 :     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
    1362             :   else
    1363             :     return nullptr;
    1364             : 
    1365       42577 :   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
    1366             :   NonRegOp.setSubReg(SubReg);
    1367             : 
    1368       42577 :   return &MI;
    1369             : }
    1370             : 
    1371      315266 : MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
    1372             :                                                   unsigned Src0Idx,
    1373             :                                                   unsigned Src1Idx) const {
    1374             :   assert(!NewMI && "this should never be used");
    1375             : 
    1376      315266 :   unsigned Opc = MI.getOpcode();
    1377      315266 :   int CommutedOpcode = commuteOpcode(Opc);
    1378      315266 :   if (CommutedOpcode == -1)
    1379             :     return nullptr;
    1380             : 
    1381             :   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
    1382             :            static_cast<int>(Src0Idx) &&
    1383             :          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
    1384             :            static_cast<int>(Src1Idx) &&
    1385             :          "inconsistency with findCommutedOpIndices");
    1386             : 
    1387      294230 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    1388             :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    1389             : 
    1390             :   MachineInstr *CommutedMI = nullptr;
    1391      294230 :   if (Src0.isReg() && Src1.isReg()) {
    1392      235900 :     if (isOperandLegal(MI, Src1Idx, &Src0)) {
    1393             :       // Be sure to copy the source modifiers to the right place.
    1394             :       CommutedMI
    1395      220681 :         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    1396             :     }
    1397             : 
    1398       58330 :   } else if (Src0.isReg() && !Src1.isReg()) {
    1399             :     // src0 should always be able to support any operand type, so no need to
    1400             :     // check operand legality.
    1401       25063 :     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
    1402       33267 :   } else if (!Src0.isReg() && Src1.isReg()) {
    1403       33256 :     if (isOperandLegal(MI, Src1Idx, &Src0))
    1404       17514 :       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
    1405             :   } else {
     1406             :     // FIXME: Found two non-register operands to commute. This does happen.
    1407             :     return nullptr;
    1408             :   }
    1409             : 
    1410      263258 :   if (CommutedMI) {
    1411      263258 :     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
    1412             :                         Src1, AMDGPU::OpName::src1_modifiers);
    1413             : 
    1414      263258 :     CommutedMI->setDesc(get(CommutedOpcode));
    1415             :   }
    1416             : 
    1417             :   return CommutedMI;
    1418             : }
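                      : // [Editorial note, not part of the original source] A hedged
                      : // example of the register/immediate swap handled above: a
                      : // "V_SUB_F32_e64 v1, 1.0, v0" can typically be commuted to
                      : // "V_SUBREV_F32_e64 v1, v0, 1.0"; commuteOpcode() supplies the
                      : // reversed opcode and swapRegAndNonRegOperand() exchanges the
                      : // immediate and register operands in place.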
    1419             : 
    1420             : // This needs to be implemented because the source modifiers may be inserted
    1421             : // between the true commutable operands, and the base
    1422             : // TargetInstrInfo::commuteInstruction uses it.
    1423      361965 : bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
    1424             :                                         unsigned &SrcOpIdx1) const {
    1425      361965 :   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
    1426             : }
    1427             : 
    1428      363029 : bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
    1429             :                                         unsigned &SrcOpIdx1) const {
    1430      726058 :   if (!Desc.isCommutable())
    1431             :     return false;
    1432             : 
    1433      311837 :   unsigned Opc = Desc.getOpcode();
    1434      311837 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    1435      311837 :   if (Src0Idx == -1)
    1436             :     return false;
    1437             : 
    1438      311837 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    1439      311837 :   if (Src1Idx == -1)
    1440             :     return false;
    1441             : 
    1442      311837 :   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
    1443             : }
    1444             : 
    1445        1081 : bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
    1446             :                                         int64_t BrOffset) const {
    1447             :   // BranchRelaxation should never have to check s_setpc_b64 because its dest
    1448             :   // block is unanalyzable.
    1449             :   assert(BranchOp != AMDGPU::S_SETPC_B64);
    1450             : 
    1451             :   // Convert to dwords.
    1452        1081 :   BrOffset /= 4;
    1453             : 
    1454             :   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
    1455             :   // from the next instruction.
    1456        1081 :   BrOffset -= 1;
    1457             : 
    1458        1081 :   return isIntN(BranchOffsetBits, BrOffset);
    1459             : }
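                      : // [Editorial note, not part of the original source] Assuming
                      : // BranchOffsetBits is 16 for the SOPP branch encodings, a forward
                      : // byte offset of 0x20000 becomes 0x20000/4 - 1 = 0x7fff dwords and
                      : // still fits, while 0x20004 becomes 0x8000 and forces branch
                      : // relaxation via insertIndirectBranch() below.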
    1460             : 
    1461        1115 : MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
    1462             :   const MachineInstr &MI) const {
    1463        2230 :   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    1464             :     // This would be a difficult analysis to perform, but can always be legal so
    1465             :     // there's no need to analyze it.
    1466             :     return nullptr;
    1467             :   }
    1468             : 
    1469        1115 :   return MI.getOperand(0).getMBB();
    1470             : }
    1471             : 
    1472          34 : unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
    1473             :                                            MachineBasicBlock &DestBB,
    1474             :                                            const DebugLoc &DL,
    1475             :                                            int64_t BrOffset,
    1476             :                                            RegScavenger *RS) const {
    1477             :   assert(RS && "RegScavenger required for long branching");
    1478             :   assert(MBB.empty() &&
    1479             :          "new block should be inserted for expanding unconditional branch");
    1480             :   assert(MBB.pred_size() == 1);
    1481             : 
    1482          34 :   MachineFunction *MF = MBB.getParent();
    1483          34 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    1484             : 
    1485             :   // FIXME: Virtual register workaround for RegScavenger not working with empty
    1486             :   // blocks.
    1487          34 :   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    1488             : 
    1489          34 :   auto I = MBB.end();
    1490             : 
    1491             :   // We need to compute the offset relative to the instruction immediately after
    1492             :   // s_getpc_b64. Insert pc arithmetic code before last terminator.
    1493          68 :   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
    1494             : 
    1495             :   // TODO: Handle > 32-bit block address.
    1496          34 :   if (BrOffset >= 0) {
    1497          52 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
    1498          26 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1499          26 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1500             :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    1501          78 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
    1502          26 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1503          26 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1504             :       .addImm(0);
    1505             :   } else {
    1506             :     // Backwards branch.
    1507          16 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
    1508           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1509           8 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1510             :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    1511          24 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
    1512           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1513           8 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1514             :       .addImm(0);
    1515             :   }
    1516             : 
    1517             :   // Insert the indirect branch after the other terminator.
    1518          34 :   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    1519          34 :     .addReg(PCReg);
    1520             : 
    1521             :   // FIXME: If spilling is necessary, this will fail because this scavenger has
    1522             :   // no emergency stack slots. It is non-trivial to spill in this situation,
    1523             :   // because the restore code needs to be specially placed after the
    1524             :   // jump. BranchRelaxation then needs to be made aware of the newly inserted
    1525             :   // block.
    1526             :   //
    1527             :   // If a spill is needed for the pc register pair, we need to insert a spill
    1528             :   // restore block right before the destination block, and insert a short branch
    1529             :   // into the old destination block's fallthrough predecessor.
    1530             :   // e.g.:
    1531             :   //
    1532             :   // s_cbranch_scc0 skip_long_branch:
    1533             :   //
    1534             :   // long_branch_bb:
    1535             :   //   spill s[8:9]
    1536             :   //   s_getpc_b64 s[8:9]
    1537             :   //   s_add_u32 s8, s8, restore_bb
    1538             :   //   s_addc_u32 s9, s9, 0
    1539             :   //   s_setpc_b64 s[8:9]
    1540             :   //
    1541             :   // skip_long_branch:
    1542             :   //   foo;
    1543             :   //
    1544             :   // .....
    1545             :   //
    1546             :   // dest_bb_fallthrough_predecessor:
    1547             :   // bar;
    1548             :   // s_branch dest_bb
    1549             :   //
    1550             :   // restore_bb:
    1551             :   //  restore s[8:9]
    1552             :   //  fallthrough dest_bb
     1553             :   //
    1554             :   // dest_bb:
    1555             :   //   buzz;
    1556             : 
    1557          34 :   RS->enterBasicBlockEnd(MBB);
    1558          34 :   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
    1559             :                                        MachineBasicBlock::iterator(GetPC), 0);
    1560          33 :   MRI.replaceRegWith(PCReg, Scav);
    1561          33 :   MRI.clearVirtRegs();
    1562          33 :   RS->setRegUsed(Scav);
    1563             : 
    1564          33 :   return 4 + 8 + 4 + 4;
    1565             : }
    1566             : 
    1567        1579 : unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
    1568        1579 :   switch (Cond) {
    1569             :   case SIInstrInfo::SCC_TRUE:
    1570             :     return AMDGPU::S_CBRANCH_SCC1;
    1571         416 :   case SIInstrInfo::SCC_FALSE:
    1572         416 :     return AMDGPU::S_CBRANCH_SCC0;
    1573         266 :   case SIInstrInfo::VCCNZ:
    1574         266 :     return AMDGPU::S_CBRANCH_VCCNZ;
    1575         243 :   case SIInstrInfo::VCCZ:
    1576         243 :     return AMDGPU::S_CBRANCH_VCCZ;
    1577         138 :   case SIInstrInfo::EXECNZ:
    1578         138 :     return AMDGPU::S_CBRANCH_EXECNZ;
    1579          93 :   case SIInstrInfo::EXECZ:
    1580          93 :     return AMDGPU::S_CBRANCH_EXECZ;
    1581           0 :   default:
    1582           0 :     llvm_unreachable("invalid branch predicate");
    1583             :   }
    1584             : }
    1585             : 
    1586      949677 : SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
    1587             :   switch (Opcode) {
    1588             :   case AMDGPU::S_CBRANCH_SCC0:
    1589             :     return SCC_FALSE;
    1590             :   case AMDGPU::S_CBRANCH_SCC1:
    1591             :     return SCC_TRUE;
    1592             :   case AMDGPU::S_CBRANCH_VCCNZ:
    1593             :     return VCCNZ;
    1594             :   case AMDGPU::S_CBRANCH_VCCZ:
    1595             :     return VCCZ;
    1596             :   case AMDGPU::S_CBRANCH_EXECNZ:
    1597             :     return EXECNZ;
    1598             :   case AMDGPU::S_CBRANCH_EXECZ:
    1599             :     return EXECZ;
    1600             :   default:
    1601             :     return INVALID_BR;
    1602             :   }
    1603             : }
    1604             : 
    1605      981361 : bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
    1606             :                                     MachineBasicBlock::iterator I,
    1607             :                                     MachineBasicBlock *&TBB,
    1608             :                                     MachineBasicBlock *&FBB,
    1609             :                                     SmallVectorImpl<MachineOperand> &Cond,
    1610             :                                     bool AllowModify) const {
    1611     1962722 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1612             :     // Unconditional Branch
    1613       31684 :     TBB = I->getOperand(0).getMBB();
    1614       31684 :     return false;
    1615             :   }
    1616             : 
    1617             :   MachineBasicBlock *CondBB = nullptr;
    1618             : 
    1619      949677 :   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    1620           0 :     CondBB = I->getOperand(1).getMBB();
    1621           0 :     Cond.push_back(I->getOperand(0));
    1622             :   } else {
    1623      949677 :     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    1624      949677 :     if (Pred == INVALID_BR)
    1625             :       return true;
    1626             : 
    1627       35095 :     CondBB = I->getOperand(0).getMBB();
    1628       70190 :     Cond.push_back(MachineOperand::CreateImm(Pred));
    1629       70190 :     Cond.push_back(I->getOperand(1)); // Save the branch register.
    1630             :   }
    1631             :   ++I;
    1632             : 
    1633       35095 :   if (I == MBB.end()) {
    1634             :     // Conditional branch followed by fall-through.
    1635       17841 :     TBB = CondBB;
    1636       17841 :     return false;
    1637             :   }
    1638             : 
    1639       34508 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1640       17251 :     TBB = CondBB;
    1641       17251 :     FBB = I->getOperand(0).getMBB();
    1642       17251 :     return false;
    1643             :   }
    1644             : 
    1645             :   return true;
    1646             : }
    1647             : 
    1648     1036569 : bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
    1649             :                                 MachineBasicBlock *&FBB,
    1650             :                                 SmallVectorImpl<MachineOperand> &Cond,
    1651             :                                 bool AllowModify) const {
    1652     1036569 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1653     1036569 :   if (I == MBB.end())
    1654             :     return false;
    1655             : 
    1656     1965166 :   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    1657      965576 :     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
    1658             : 
    1659             :   ++I;
    1660             : 
    1661             :   // TODO: Should be able to treat as fallthrough?
    1662       17007 :   if (I == MBB.end())
    1663             :     return true;
    1664             : 
    1665       15785 :   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    1666             :     return true;
    1667             : 
    1668       15785 :   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
    1669             : 
    1670             :   // Specifically handle the case where the conditional branch is to the same
    1671             :   // destination as the mask branch. e.g.
    1672             :   //
    1673             :   // si_mask_branch BB8
    1674             :   // s_cbranch_execz BB8
    1675             :   // s_cbranch BB9
    1676             :   //
    1677             :   // This is required to understand divergent loops which may need the branches
    1678             :   // to be relaxed.
    1679       15785 :   if (TBB != MaskBrDest || Cond.empty())
    1680             :     return true;
    1681             : 
    1682         457 :   auto Pred = Cond[0].getImm();
    1683         457 :   return (Pred != EXECZ && Pred != EXECNZ);
    1684             : }
    1685             : 
    1686        2516 : unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
    1687             :                                    int *BytesRemoved) const {
    1688        2516 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1689             : 
    1690             :   unsigned Count = 0;
    1691             :   unsigned RemovedSize = 0;
    1692        5676 :   while (I != MBB.end()) {
    1693        3160 :     MachineBasicBlock::iterator Next = std::next(I);
    1694        6320 :     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
    1695             :       I = Next;
    1696             :       continue;
    1697             :     }
    1698             : 
    1699        3152 :     RemovedSize += getInstSizeInBytes(*I);
    1700        3152 :     I->eraseFromParent();
    1701        3152 :     ++Count;
    1702             :     I = Next;
    1703             :   }
    1704             : 
    1705        2516 :   if (BytesRemoved)
    1706          32 :     *BytesRemoved = RemovedSize;
    1707             : 
    1708        2516 :   return Count;
    1709             : }
    1710             : 
    1711             : // Copy the flags onto the implicit condition register operand.
    1712             : static void preserveCondRegFlags(MachineOperand &CondReg,
    1713             :                                  const MachineOperand &OrigCond) {
    1714             :   CondReg.setIsUndef(OrigCond.isUndef());
    1715             :   CondReg.setIsKill(OrigCond.isKill());
    1716             : }
    1717             : 
    1718        2249 : unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
    1719             :                                    MachineBasicBlock *TBB,
    1720             :                                    MachineBasicBlock *FBB,
    1721             :                                    ArrayRef<MachineOperand> Cond,
    1722             :                                    const DebugLoc &DL,
    1723             :                                    int *BytesAdded) const {
    1724        2249 :   if (!FBB && Cond.empty()) {
    1725         670 :     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1726             :       .addMBB(TBB);
    1727         670 :     if (BytesAdded)
    1728           0 :       *BytesAdded = 4;
    1729         670 :     return 1;
    1730             :   }
    1731             : 
     1732        1579 :   if (Cond.size() == 1 && Cond[0].isReg()) {
     1733           0 :     BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
     1734             :       .add(Cond[0])
     1735             :       .addMBB(TBB);
     1736           0 :     return 1;
     1737             :   }
    1738             : 
    1739             :   assert(TBB && Cond[0].isImm());
    1740             : 
    1741             :   unsigned Opcode
    1742        1579 :     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
    1743             : 
    1744        1579 :   if (!FBB) {
    1745             :     Cond[1].isUndef();
    1746             :     MachineInstr *CondBr =
    1747        1481 :       BuildMI(&MBB, DL, get(Opcode))
    1748             :       .addMBB(TBB);
    1749             : 
    1750             :     // Copy the flags onto the implicit condition register operand.
    1751        1481 :     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    1752             : 
    1753        1481 :     if (BytesAdded)
    1754           0 :       *BytesAdded = 4;
    1755        1481 :     return 1;
    1756             :   }
    1757             : 
    1758             :   assert(TBB && FBB);
    1759             : 
    1760             :   MachineInstr *CondBr =
    1761          98 :     BuildMI(&MBB, DL, get(Opcode))
    1762             :     .addMBB(TBB);
    1763          98 :   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1764             :     .addMBB(FBB);
    1765             : 
    1766          98 :   MachineOperand &CondReg = CondBr->getOperand(1);
    1767             :   CondReg.setIsUndef(Cond[1].isUndef());
    1768             :   CondReg.setIsKill(Cond[1].isKill());
    1769             : 
    1770          98 :   if (BytesAdded)
    1771          32 :       *BytesAdded = 8;
    1772             : 
    1773             :   return 2;
    1774             : }
    1775             : 
    1776        1296 : bool SIInstrInfo::reverseBranchCondition(
    1777             :   SmallVectorImpl<MachineOperand> &Cond) const {
    1778        1296 :   if (Cond.size() != 2) {
    1779             :     return true;
    1780             :   }
    1781             : 
    1782        1296 :   if (Cond[0].isImm()) {
    1783        1296 :     Cond[0].setImm(-Cond[0].getImm());
    1784        1296 :     return false;
    1785             :   }
    1786             : 
    1787             :   return true;
    1788             : }
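                      : // [Editorial note, not part of the original source] Negating the
                      : // immediate works because BranchPredicate appears to encode each
                      : // predicate and its inverse as +N/-N (e.g. SCC_TRUE/SCC_FALSE);
                      : // insertSelect() later in this file relies on the same property
                      : // when it flips Pred with a unary minus.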
    1789             : 
    1790          22 : bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
    1791             :                                   ArrayRef<MachineOperand> Cond,
    1792             :                                   unsigned TrueReg, unsigned FalseReg,
    1793             :                                   int &CondCycles,
    1794             :                                   int &TrueCycles, int &FalseCycles) const {
    1795          22 :   switch (Cond[0].getImm()) {
    1796          15 :   case VCCNZ:
    1797             :   case VCCZ: {
    1798          15 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1799             :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1800             :     assert(MRI.getRegClass(FalseReg) == RC);
    1801             : 
    1802          30 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1803          15 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1804             : 
    1805             :     // Limit to equal cost for branch vs. N v_cndmask_b32s.
    1806          15 :     return !RI.isSGPRClass(RC) && NumInsts <= 6;
    1807             :   }
    1808           7 :   case SCC_TRUE:
    1809             :   case SCC_FALSE: {
    1810             :     // FIXME: We could insert for VGPRs if we could replace the original compare
    1811             :     // with a vector one.
    1812           7 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1813             :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1814             :     assert(MRI.getRegClass(FalseReg) == RC);
    1815             : 
    1816          14 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1817             : 
     1818             :     // Register sizes that are a multiple of 64 bits can use s_cselect_b64 per pair.
    1819           7 :     if (NumInsts % 2 == 0)
    1820           3 :       NumInsts /= 2;
    1821             : 
    1822           7 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1823           7 :     return RI.isSGPRClass(RC);
    1824             :   }
    1825             :   default:
    1826             :     return false;
    1827             :   }
    1828             : }
    1829             : 
    1830          16 : void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
    1831             :                                MachineBasicBlock::iterator I, const DebugLoc &DL,
    1832             :                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
    1833             :                                unsigned TrueReg, unsigned FalseReg) const {
    1834          16 :   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
    1835          16 :   if (Pred == VCCZ || Pred == SCC_FALSE) {
    1836           0 :     Pred = static_cast<BranchPredicate>(-Pred);
    1837             :     std::swap(TrueReg, FalseReg);
    1838             :   }
    1839             : 
    1840          16 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1841             :   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
    1842             :   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
    1843             : 
    1844          16 :   if (DstSize == 32) {
    1845           9 :     unsigned SelOp = Pred == SCC_TRUE ?
    1846             :       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
    1847             : 
    1848             :     // Instruction's operands are backwards from what is expected.
    1849             :     MachineInstr *Select =
    1850          18 :       BuildMI(MBB, I, DL, get(SelOp), DstReg)
    1851           9 :       .addReg(FalseReg)
    1852           9 :       .addReg(TrueReg);
    1853             : 
    1854           9 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1855          10 :     return;
    1856             :   }
    1857             : 
    1858           7 :   if (DstSize == 64 && Pred == SCC_TRUE) {
    1859             :     MachineInstr *Select =
    1860           2 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
    1861           1 :       .addReg(FalseReg)
    1862           1 :       .addReg(TrueReg);
    1863             : 
    1864           1 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1865           1 :     return;
    1866             :   }
    1867             : 
    1868             :   static const int16_t Sub0_15[] = {
    1869             :     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    1870             :     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    1871             :     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    1872             :     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    1873             :   };
    1874             : 
    1875             :   static const int16_t Sub0_15_64[] = {
    1876             :     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    1877             :     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    1878             :     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    1879             :     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
    1880             :   };
    1881             : 
    1882             :   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
    1883             :   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
    1884             :   const int16_t *SubIndices = Sub0_15;
    1885           6 :   int NElts = DstSize / 32;
    1886             : 
     1887             :   // 64-bit select is only available for SALU.
    1888           6 :   if (Pred == SCC_TRUE) {
    1889             :     SelOp = AMDGPU::S_CSELECT_B64;
    1890             :     EltRC = &AMDGPU::SGPR_64RegClass;
    1891             :     SubIndices = Sub0_15_64;
    1892             : 
    1893             :     assert(NElts % 2 == 0);
    1894           2 :     NElts /= 2;
    1895             :   }
    1896             : 
    1897             :   MachineInstrBuilder MIB = BuildMI(
    1898          12 :     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
    1899             : 
    1900           6 :   I = MIB->getIterator();
    1901             : 
    1902             :   SmallVector<unsigned, 8> Regs;
    1903          22 :   for (int Idx = 0; Idx != NElts; ++Idx) {
    1904          16 :     unsigned DstElt = MRI.createVirtualRegister(EltRC);
    1905          16 :     Regs.push_back(DstElt);
    1906             : 
    1907          16 :     unsigned SubIdx = SubIndices[Idx];
    1908             : 
    1909             :     MachineInstr *Select =
    1910          32 :       BuildMI(MBB, I, DL, get(SelOp), DstElt)
    1911          16 :       .addReg(FalseReg, 0, SubIdx)
    1912          16 :       .addReg(TrueReg, 0, SubIdx);
    1913          16 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1914             : 
    1915          16 :     MIB.addReg(DstElt)
    1916          16 :        .addImm(SubIdx);
    1917             :   }
    1918             : }
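                      : // [Editorial sketch, not part of the original source] For a wide
                      : // VGPR destination the loop above splits the select into one
                      : // V_CNDMASK_B32 per 32-bit element, e.g. a 128-bit result becomes
                      : // four selects on sub0..sub3 that a single REG_SEQUENCE then
                      : // recombines into DstReg.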
    1919             : 
    1920      964557 : bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
    1921     1929114 :   switch (MI.getOpcode()) {
    1922       25142 :   case AMDGPU::V_MOV_B32_e32:
    1923             :   case AMDGPU::V_MOV_B32_e64:
    1924             :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1925             :     // If there are additional implicit register operands, this may be used for
    1926             :     // register indexing so the source register operand isn't simply copied.
    1927       25142 :     unsigned NumOps = MI.getDesc().getNumOperands() +
    1928       25142 :       MI.getDesc().getNumImplicitUses();
    1929             : 
    1930       25142 :     return MI.getNumOperands() == NumOps;
    1931             :   }
    1932             :   case AMDGPU::S_MOV_B32:
    1933             :   case AMDGPU::S_MOV_B64:
    1934             :   case AMDGPU::COPY:
    1935             :     return true;
    1936      533556 :   default:
    1937      533556 :     return false;
    1938             :   }
    1939             : }
    1940             : 
    1941       86955 : unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
    1942             :     unsigned Kind) const {
    1943             :   switch(Kind) {
    1944             :   case PseudoSourceValue::Stack:
    1945             :   case PseudoSourceValue::FixedStack:
    1946             :     return AMDGPUAS::PRIVATE_ADDRESS;
    1947             :   case PseudoSourceValue::ConstantPool:
    1948             :   case PseudoSourceValue::GOT:
    1949             :   case PseudoSourceValue::JumpTable:
    1950             :   case PseudoSourceValue::GlobalValueCallEntry:
    1951             :   case PseudoSourceValue::ExternalSymbolCallEntry:
    1952             :   case PseudoSourceValue::TargetCustom:
    1953             :     return AMDGPUAS::CONSTANT_ADDRESS;
    1954             :   }
    1955             :   return AMDGPUAS::FLAT_ADDRESS;
    1956             : }
    1957             : 
    1958          38 : static void removeModOperands(MachineInstr &MI) {
    1959          38 :   unsigned Opc = MI.getOpcode();
    1960          38 :   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1961             :                                               AMDGPU::OpName::src0_modifiers);
    1962          38 :   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1963             :                                               AMDGPU::OpName::src1_modifiers);
    1964          38 :   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1965             :                                               AMDGPU::OpName::src2_modifiers);
    1966             : 
    1967          38 :   MI.RemoveOperand(Src2ModIdx);
    1968          38 :   MI.RemoveOperand(Src1ModIdx);
    1969          38 :   MI.RemoveOperand(Src0ModIdx);
    1970          38 : }
    1971             : 
    1972       57238 : bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
    1973             :                                 unsigned Reg, MachineRegisterInfo *MRI) const {
    1974       57238 :   if (!MRI->hasOneNonDBGUse(Reg))
    1975             :     return false;
    1976             : 
    1977       45780 :   switch (DefMI.getOpcode()) {
    1978             :   default:
    1979             :     return false;
    1980             :   case AMDGPU::S_MOV_B64:
     1981             :     // TODO: We could fold 64-bit immediates, but this gets complicated
    1982             :     // when there are sub-registers.
    1983             :     return false;
    1984             : 
    1985             :   case AMDGPU::V_MOV_B32_e32:
    1986             :   case AMDGPU::S_MOV_B32:
    1987             :     break;
    1988             :   }
    1989             : 
    1990       22313 :   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    1991             :   assert(ImmOp);
    1992             :   // FIXME: We could handle FrameIndex values here.
    1993       22313 :   if (!ImmOp->isImm())
    1994             :     return false;
    1995             : 
    1996       21939 :   unsigned Opc = UseMI.getOpcode();
    1997       21939 :   if (Opc == AMDGPU::COPY) {
    1998        5813 :     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    1999        5813 :     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    2000        5813 :     UseMI.setDesc(get(NewOpc));
    2001       11626 :     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    2002        5813 :     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    2003        5813 :     return true;
    2004             :   }
    2005             : 
    2006       16126 :   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
    2007       15920 :       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    2008             :     // Don't fold if we are using source or output modifiers. The new VOP2
    2009             :     // instructions don't have them.
    2010         241 :     if (hasAnyModifiersSet(UseMI))
    2011             :       return false;
    2012             : 
    2013             :     // If this is a free constant, there's no reason to do this.
    2014             :     // TODO: We could fold this here instead of letting SIFoldOperands do it
    2015             :     // later.
    2016         207 :     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
    2017             : 
    2018             :     // Any src operand can be used for the legality check.
    2019         207 :     if (isInlineConstant(UseMI, *Src0, *ImmOp))
    2020             :       return false;
    2021             : 
    2022             :     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    2023         106 :     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    2024         106 :     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
    2025             : 
    2026             :     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    2027             :     // We should only expect these to be on src0 due to canonicalizations.
    2028         106 :     if (Src0->isReg() && Src0->getReg() == Reg) {
    2029           7 :       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    2030           0 :         return false;
    2031             : 
    2032           7 :       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
    2033           0 :         return false;
    2034             : 
    2035             :       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
    2036             : 
    2037           7 :       const int64_t Imm = ImmOp->getImm();
    2038             : 
    2039             :       // FIXME: This would be a lot easier if we could return a new instruction
    2040             :       // instead of having to modify in place.
    2041             : 
    2042             :       // Remove these first since they are at the end.
    2043           7 :       UseMI.RemoveOperand(
    2044           7 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    2045           7 :       UseMI.RemoveOperand(
    2046           7 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    2047             : 
    2048           7 :       unsigned Src1Reg = Src1->getReg();
    2049             :       unsigned Src1SubReg = Src1->getSubReg();
    2050           7 :       Src0->setReg(Src1Reg);
    2051             :       Src0->setSubReg(Src1SubReg);
    2052             :       Src0->setIsKill(Src1->isKill());
    2053             : 
    2054           7 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    2055           7 :           Opc == AMDGPU::V_MAC_F16_e64)
    2056           0 :         UseMI.untieRegOperand(
    2057           0 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    2058             : 
    2059           7 :       Src1->ChangeToImmediate(Imm);
    2060             : 
    2061           7 :       removeModOperands(UseMI);
    2062          10 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
    2063             : 
    2064           7 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    2065           7 :       if (DeleteDef)
    2066           0 :         DefMI.eraseFromParent();
    2067             : 
    2068           7 :       return true;
    2069             :     }
    2070             : 
    2071             :     // Added part is the constant: Use v_madak_{f16, f32}.
    2072          99 :     if (Src2->isReg() && Src2->getReg() == Reg) {
    2073             :       // Not allowed to use constant bus for another operand.
    2074             :       // We can however allow an inline immediate as src0.
    2075             :       bool Src0Inlined = false;
    2076          44 :       if (Src0->isReg()) {
    2077             :         // Try to inline constant if possible.
    2078             :         // If the def is a move of an immediate and the use is the only one,
    2079             :         // folding it here saves a VGPR.
    2080          44 :         MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
    2081          46 :         if (Def && Def->isMoveImmediate() &&
    2082          53 :           isInlineConstant(Def->getOperand(1)) &&
    2083           3 :           MRI->hasOneUse(Src0->getReg())) {
    2084           3 :           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
    2085             :           Src0Inlined = true;
    2086          43 :         } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
    2087          42 :             RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
    2088          79 :             (RI.isVirtualRegister(Src0->getReg()) &&
    2089          39 :             RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
    2090           7 :           return false;
    2091             :           // VGPR is okay as Src0 - fallthrough
    2092             :       }
    2093             : 
    2094          37 :       if (Src1->isReg() && !Src0Inlined ) {
    2095             :         // We still have one slot for an inlinable constant - try to fill it
    2096          34 :         MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
    2097          49 :         if (Def && Def->isMoveImmediate() &&
    2098          46 :             isInlineConstant(Def->getOperand(1)) &&
    2099          60 :             MRI->hasOneUse(Src1->getReg()) &&
    2100          12 :             commuteInstruction(UseMI)) {
    2101          12 :             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
    2102          23 :         } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
    2103          22 :             RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
    2104          42 :             (RI.isVirtualRegister(Src1->getReg()) &&
    2105          21 :             RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
    2106           6 :           return false;
    2107             :           // VGPR is okay as Src1 - fallthrough
    2108             :       }
    2109             : 
    2110          31 :       const int64_t Imm = ImmOp->getImm();
    2111             : 
    2112             :       // FIXME: This would be a lot easier if we could return a new instruction
    2113             :       // instead of having to modify in place.
    2114             : 
    2115             :       // Remove these first since they are at the end.
    2116          31 :       UseMI.RemoveOperand(
    2117          31 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    2118          31 :       UseMI.RemoveOperand(
    2119          31 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    2120             : 
    2121          31 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    2122          31 :           Opc == AMDGPU::V_MAC_F16_e64)
    2123          28 :         UseMI.untieRegOperand(
    2124          28 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    2125             : 
    2126             :       // ChangeToImmediate() adds Src2 back to the instruction.
    2127          31 :       Src2->ChangeToImmediate(Imm);
    2128             : 
    2129             :       // These come before src2.
    2130          31 :       removeModOperands(UseMI);
    2131          35 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
    2132             : 
    2133          31 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    2134          31 :       if (DeleteDef)
    2135           0 :         DefMI.eraseFromParent();
    2136             : 
    2137          31 :       return true;
    2138             :     }
    2139             :   }
    2140             : 
    2141             :   return false;
    2142             : }
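                     : // Editor's sketch (not part of the original source): the madak fold above
                     : // rewrites a pattern such as
                     : //   %c = V_MOV_B32_e32 0x40400000            ; 3.0f, not an inline constant
                     : //   %d = V_MAD_F32 0, %a, 0, %b, 0, %c, 0, 0
                     : // into
                     : //   %d = V_MADAK_F32 %a, %b, 0x40400000
                     : // assuming %c has a single non-debug use, so the defining move can be erased.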
    2143             : 
    2144             : static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
    2145             :                                 int WidthB, int OffsetB) {
    2146       23856 :   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
    2147       23856 :   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
    2148       23856 :   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
    2149       23856 :   return LowOffset + LowWidth <= HighOffset;
    2150             : }
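                     : // Worked example (editor's note): with WidthA = 4, OffsetA = 0, WidthB = 8,
                     : // OffsetB = 4, the lower access is A and 0 + 4 <= 4 holds, so [0,4) and
                     : // [4,12) are disjoint. With OffsetB = 2 the check is 0 + 4 <= 2, which fails,
                     : // so the accesses are conservatively treated as overlapping.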
    2151             : 
    2152      874733 : bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
    2153             :                                                MachineInstr &MIb) const {
    2154             :   unsigned BaseReg0, BaseReg1;
    2155             :   int64_t Offset0, Offset1;
    2156             : 
    2157      964574 :   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
    2158       89841 :       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    2159             : 
    2160      148347 :     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
    2161             :       // FIXME: Handle ds_read2 / ds_write2.
    2162       23814 :       return false;
    2163             :     }
    2164       61403 :     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    2165       61403 :     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    2166       85259 :     if (BaseReg0 == BaseReg1 &&
    2167       23856 :         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
    2168       18845 :       return true;
    2169             :     }
    2170             :   }
    2171             : 
    2172             :   return false;
    2173             : }
    2174             : 
    2175      959878 : bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
    2176             :                                                   MachineInstr &MIb,
    2177             :                                                   AliasAnalysis *AA) const {
    2178             :   assert((MIa.mayLoad() || MIa.mayStore()) &&
    2179             :          "MIa must load from or modify a memory location");
    2180             :   assert((MIb.mayLoad() || MIb.mayStore()) &&
    2181             :          "MIb must load from or modify a memory location");
    2182             : 
    2183      959878 :   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    2184           0 :     return false;
    2185             : 
    2186             :   // XXX - Can we relax this between address spaces?
    2187      959878 :   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    2188         208 :     return false;
    2189             : 
    2190      964990 :   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
    2191        2622 :     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
    2192        2622 :     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
    2193        4461 :     if (MMOa->getValue() && MMOb->getValue()) {
    2194        1666 :       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
    2195        1666 :       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
    2196        1666 :       if (!AA->alias(LocA, LocB))
    2197        1172 :         return true;
    2198             :     }
    2199             :   }
    2200             : 
    2201             :   // TODO: Should we check the address space from the MachineMemOperand? That
    2202             :   // would allow us to distinguish objects we know don't alias based on the
    2203             :   // underlying address space, even if it was lowered to a different one,
    2204             :   // e.g. private accesses lowered to use MUBUF instructions on a scratch
    2205             :   // buffer.
    2206      958498 :   if (isDS(MIa)) {
    2207      118542 :     if (isDS(MIb))
    2208       64311 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2209             : 
    2210       54231 :     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
    2211             :   }
    2212             : 
    2213      839956 :   if (isMUBUF(MIa) || isMTBUF(MIa)) {
    2214      789645 :     if (isMUBUF(MIb) || isMTBUF(MIb))
    2215      777403 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2216             : 
    2217       12242 :     return !isFLAT(MIb) && !isSMRD(MIb);
    2218             :   }
    2219             : 
    2220       50311 :   if (isSMRD(MIa)) {
    2221        3689 :     if (isSMRD(MIb))
    2222           0 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2223             : 
    2224        3689 :     return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
    2225             :   }
    2226             : 
    2227       46622 :   if (isFLAT(MIa)) {
    2228       46563 :     if (isFLAT(MIb))
    2229       33019 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2230             : 
    2231             :     return false;
    2232             :   }
    2233             : 
    2234             :   return false;
    2235             : }
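                     : // Example (editor's note): a DS (LDS) access and a MUBUF buffer access can
                     : // never alias because they target different memories, so the DS case above
                     : // returns true for any non-FLAT MIb; two DS accesses still go through the
                     : // base-register/offset overlap check.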
    2236             : 
    2237         768 : static int64_t getFoldableImm(const MachineOperand* MO) {
    2238         768 :   if (!MO->isReg())
    2239             :     return 0;
    2240         766 :   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
    2241         766 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2242         766 :   auto Def = MRI.getUniqueVRegDef(MO->getReg());
    2243         766 :   if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
    2244          17 :       Def->getOperand(1).isImm())
    2245          17 :     return Def->getOperand(1).getImm();
    2246             :   return 0; // 0 means no foldable immediate; 0 itself is inline anyway.
    2247             : }
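                     : // Example (editor's note): given "%k = V_MOV_B32_e32 42" and an operand that
                     : // reads %k, getFoldableImm returns 42; any other pattern yields 0, which the
                     : // callers below treat as "no foldable immediate".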
    2248             : 
    2249         305 : MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
    2250             :                                                  MachineInstr &MI,
    2251             :                                                  LiveVariables *LV) const {
    2252         305 :   unsigned Opc = MI.getOpcode();
    2253             :   bool IsF16 = false;
    2254         305 :   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
    2255             : 
    2256         305 :   switch (Opc) {
    2257             :   default:
    2258             :     return nullptr;
    2259           0 :   case AMDGPU::V_MAC_F16_e64:
    2260             :     IsF16 = true;
    2261             :     LLVM_FALLTHROUGH;
    2262             :   case AMDGPU::V_MAC_F32_e64:
    2263             :   case AMDGPU::V_FMAC_F32_e64:
    2264             :     break;
    2265           6 :   case AMDGPU::V_MAC_F16_e32:
    2266             :     IsF16 = true;
    2267             :     LLVM_FALLTHROUGH;
    2268         291 :   case AMDGPU::V_MAC_F32_e32:
    2269             :   case AMDGPU::V_FMAC_F32_e32: {
    2270         291 :     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
    2271             :                                              AMDGPU::OpName::src0);
    2272         291 :     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    2273         291 :     if (!Src0->isReg() && !Src0->isImm())
    2274             :       return nullptr;
    2275             : 
    2276         290 :     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
    2277             :       return nullptr;
    2278             : 
    2279             :     break;
    2280             :   }
    2281             :   }
    2282             : 
    2283         301 :   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    2284         301 :   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
    2285             :   const MachineOperand *Src0Mods =
    2286         301 :     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
    2287         301 :   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
    2288             :   const MachineOperand *Src1Mods =
    2289         301 :     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
    2290         301 :   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
    2291         301 :   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2292         301 :   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2293             : 
    2294         301 :   if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
    2295             :       // If we have an SGPR input, we will violate the constant bus restriction.
    2296         558 :       (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
    2297         265 :     if (auto Imm = getFoldableImm(Src2)) {
    2298          12 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2299          32 :                      get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
    2300             :                .add(*Dst)
    2301             :                .add(*Src0)
    2302             :                .add(*Src1)
    2303             :                .addImm(Imm);
    2304             :     }
    2305         253 :     if (auto Imm = getFoldableImm(Src1)) {
    2306           3 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2307           7 :                      get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2308             :                .add(*Dst)
    2309             :                .add(*Src0)
    2310             :                .addImm(Imm)
    2311             :                .add(*Src2);
    2312             :     }
    2313         250 :     if (auto Imm = getFoldableImm(Src0)) {
    2314           2 :       if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
    2315             :                            AMDGPU::OpName::src0), Src1))
    2316           2 :         return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2317           4 :                        get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2318             :                  .add(*Dst)
    2319             :                  .add(*Src1)
    2320             :                  .addImm(Imm)
    2321             :                  .add(*Src2);
    2322             :     }
    2323             :   }
    2324             : 
    2325             :   assert((!IsFMA || !IsF16) && "fmac only expected with f32");
    2326         284 :   unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
    2327             :     (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
    2328         568 :   return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
    2329             :       .add(*Dst)
    2330         284 :       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
    2331             :       .add(*Src0)
    2332         284 :       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
    2333             :       .add(*Src1)
    2334             :       .addImm(0) // Src mods
    2335             :       .add(*Src2)
    2336         284 :       .addImm(Clamp ? Clamp->getImm() : 0)
    2337         284 :       .addImm(Omod ? Omod->getImm() : 0);
    2338             : }
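                     : // Editor's sketch of the conversion above: the two-address form
                     : //   %acc1 = V_MAC_F32_e32 %a, %b, %acc0   (src2 tied to the def)
                     : // becomes the untied three-address form
                     : //   %acc1 = V_MAD_F32 0, %a, 0, %b, 0, %acc0, 0, 0
                     : // or V_MADMK_F32 / V_MADAK_F32 when one of the sources is a foldable literal.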
    2339             : 
    2340             : // It's not generally safe to move VALU instructions across these since it will
    2341             : // start using the register as a base index rather than directly.
    2342             : // XXX - Why isn't hasSideEffects sufficient for these?
    2343             : static bool changesVGPRIndexingMode(const MachineInstr &MI) {
    2344      533351 :   switch (MI.getOpcode()) {
    2345             :   case AMDGPU::S_SET_GPR_IDX_ON:
    2346             :   case AMDGPU::S_SET_GPR_IDX_MODE:
    2347             :   case AMDGPU::S_SET_GPR_IDX_OFF:
    2348             :     return true;
    2349             :   default:
    2350             :     return false;
    2351             :   }
    2352             : }
    2353             : 
    2354      576381 : bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
    2355             :                                        const MachineBasicBlock *MBB,
    2356             :                                        const MachineFunction &MF) const {
    2357             :   // XXX - Do we want the SP check in the base implementation?
    2358             : 
    2359             :   // Target-independent instructions do not have an implicit-use of EXEC, even
    2360             :   // when they operate on VGPRs. Treating EXEC modifications as scheduling
    2361             :   // boundaries prevents incorrect movements of such instructions.
    2362     1113214 :   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
    2363     1070358 :          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
    2364      533525 :          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
    2365      576381 :          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
    2366      576381 :          changesVGPRIndexingMode(MI);
    2367             : }
    2368             : 
    2369        3260 : bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
    2370        3260 :   unsigned Opcode = MI.getOpcode();
    2371             : 
    2372        3260 :   if (MI.mayStore() && isSMRD(MI))
    2373             :     return true; // scalar store or atomic
    2374             : 
    2375             :   // These instructions cause shader I/O that may cause hardware lockups
    2376             :   // when executed with an empty EXEC mask.
    2377             :   //
    2378             :   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
    2379             :   //       EXEC = 0, but checking for that case here seems not worth it
    2380             :   //       given the typical code patterns.
    2381        6520 :   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
    2382        3260 :       Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
    2383             :     return true;
    2384             : 
    2385        3248 :   if (MI.isInlineAsm())
    2386             :     return true; // conservative assumption
    2387             : 
    2388             :   // These are like SALU instructions in terms of effects, so it's questionable
    2389             :   // whether we should return true for those.
    2390             :   //
    2391             :   // However, executing them with EXEC = 0 causes them to operate on undefined
    2392             :   // data, which we avoid by returning true here.
    2393        3236 :   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
    2394          12 :     return true;
    2395             : 
    2396             :   return false;
    2397             : }
    2398             : 
    2399        5823 : bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
    2400        5823 :   switch (Imm.getBitWidth()) {
    2401         156 :   case 32:
    2402         312 :     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
    2403         312 :                                         ST.hasInv2PiInlineImm());
    2404        5514 :   case 64:
    2405       11028 :     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
    2406       11028 :                                         ST.hasInv2PiInlineImm());
    2407         153 :   case 16:
    2408         153 :     return ST.has16BitInsts() &&
    2409         153 :            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
    2410         153 :                                         ST.hasInv2PiInlineImm());
    2411           0 :   default:
    2412           0 :     llvm_unreachable("invalid bitwidth");
    2413             :   }
    2414             : }
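                     : // For reference (editor's note): the inline-constant set tested here is the
                     : // integers -16..64 plus +/-0.5, +/-1.0, +/-2.0, +/-4.0, and 1/(2*pi) on
                     : // subtargets with hasInv2PiInlineImm(), encoded at the operand's bit width.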
    2415             : 
    2416     4760521 : bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
    2417             :                                    uint8_t OperandType) const {
    2418             :   if (!MO.isImm() ||
    2419     4760521 :       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
    2420             :       OperandType > AMDGPU::OPERAND_SRC_LAST)
    2421             :     return false;
    2422             : 
    2423             :   // MachineOperand provides no way to tell the true operand size, since it only
    2424             :   // records a 64-bit value. We need to know the size to determine if a 32-bit
    2425             :   // floating point immediate bit pattern is legal for an integer immediate. It
    2426             :   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
    2427             : 
    2428     4707374 :   int64_t Imm = MO.getImm();
    2429     4707374 :   switch (OperandType) {
    2430     4458799 :   case AMDGPU::OPERAND_REG_IMM_INT32:
    2431             :   case AMDGPU::OPERAND_REG_IMM_FP32:
    2432             :   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2433             :   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
    2434     4458799 :     int32_t Trunc = static_cast<int32_t>(Imm);
    2435     4458799 :     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
    2436             :   }
    2437       49180 :   case AMDGPU::OPERAND_REG_IMM_INT64:
    2438             :   case AMDGPU::OPERAND_REG_IMM_FP64:
    2439             :   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2440             :   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2441       49180 :     return AMDGPU::isInlinableLiteral64(MO.getImm(),
    2442       49180 :                                         ST.hasInv2PiInlineImm());
    2443             :   case AMDGPU::OPERAND_REG_IMM_INT16:
    2444             :   case AMDGPU::OPERAND_REG_IMM_FP16:
    2445             :   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2446             :   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2447      191705 :     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
    2448             :       // A few special case instructions have 16-bit operands on subtargets
    2449             :       // where 16-bit instructions are not legal.
    2450             :       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
    2451             :       // constants in these cases
    2452             :       int16_t Trunc = static_cast<int16_t>(Imm);
    2453      191582 :       return ST.has16BitInsts() &&
    2454      191580 :              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    2455             :     }
    2456             : 
    2457             :     return false;
    2458             :   }
    2459        7690 :   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    2460             :   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
    2461        7690 :     if (isUInt<16>(Imm)) {
    2462         925 :       int16_t Trunc = static_cast<int16_t>(Imm);
    2463         925 :       return ST.has16BitInsts() &&
    2464         925 :              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    2465             :     }
    2466        6765 :     if (!(Imm & 0xffff)) {
    2467          25 :       return ST.has16BitInsts() &&
    2468          25 :              AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
    2469             :     }
    2470             :     uint32_t Trunc = static_cast<uint32_t>(Imm);
    2471        6740 :     return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
    2472             :   }
    2473           0 :   default:
    2474           0 :     llvm_unreachable("invalid bitwidth");
    2475             :   }
    2476             : }
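                     : // Example (editor's note): for OPERAND_REG_IMM_INT32, Imm = 64 is an inline
                     : // constant while Imm = 65 must be emitted as a 32-bit literal; the 16-bit and
                     : // packed cases above apply the same test to the truncated value.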
    2477             : 
    2478      640447 : bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
    2479             :                                         const MCOperandInfo &OpInfo) const {
    2480             :   switch (MO.getType()) {
    2481             :   case MachineOperand::MO_Register:
    2482             :     return false;
    2483      191414 :   case MachineOperand::MO_Immediate:
    2484      191414 :     return !isInlineConstant(MO, OpInfo);
    2485             :   case MachineOperand::MO_FrameIndex:
    2486             :   case MachineOperand::MO_MachineBasicBlock:
    2487             :   case MachineOperand::MO_ExternalSymbol:
    2488             :   case MachineOperand::MO_GlobalAddress:
    2489             :   case MachineOperand::MO_MCSymbol:
    2490             :     return true;
    2491           0 :   default:
    2492           0 :     llvm_unreachable("unexpected operand type");
    2493             :   }
    2494             : }
    2495             : 
    2496             : static bool compareMachineOp(const MachineOperand &Op0,
    2497             :                              const MachineOperand &Op1) {
    2498             :   if (Op0.getType() != Op1.getType())
    2499             :     return false;
    2500             : 
    2501             :   switch (Op0.getType()) {
    2502        5892 :   case MachineOperand::MO_Register:
    2503       17842 :     return Op0.getReg() == Op1.getReg();
    2504             :   case MachineOperand::MO_Immediate:
    2505             :     return Op0.getImm() == Op1.getImm();
    2506             :   default:
    2507             :     llvm_unreachable("Didn't expect to be comparing these operand types");
    2508             :   }
    2509             : }
    2510             : 
    2511       92762 : bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
    2512             :                                     const MachineOperand &MO) const {
    2513      185524 :   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
    2514             : 
    2515             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    2516             : 
    2517       92762 :   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    2518             :     return true;
    2519             : 
    2520       92762 :   if (OpInfo.RegClass < 0)
    2521             :     return false;
    2522             : 
    2523      185262 :   if (MO.isImm() && isInlineConstant(MO, OpInfo))
    2524      128406 :     return RI.opCanUseInlineConstant(OpInfo.OperandType);
    2525             : 
    2526       57118 :   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
    2527             : }
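                     : // Note (editor's addition): the distinction matters for VOP3-style operands
                     : // (OPERAND_REG_INLINE_C_*), which accept inline constants but no 32-bit
                     : // literals, whereas OPERAND_REG_IMM_* operands accept either form.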
    2528             : 
    2529      777103 : bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
    2530      777103 :   int Op32 = AMDGPU::getVOPe32(Opcode);
    2531      777103 :   if (Op32 == -1)
    2532             :     return false;
    2533             : 
    2534      134545 :   return pseudoToMCOpcode(Op32) != -1;
    2535             : }
    2536             : 
    2537           0 : bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
    2538             :   // The src0_modifiers operand is present on all instructions
    2539             :   // that have modifiers.
    2540             : 
    2541           0 :   return AMDGPU::getNamedOperandIdx(Opcode,
    2542           0 :                                     AMDGPU::OpName::src0_modifiers) != -1;
    2543             : }
    2544             : 
    2545      219769 : bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
    2546             :                                   unsigned OpName) const {
    2547             :   const MachineOperand *Mods = getNamedOperand(MI, OpName);
    2548      219769 :   return Mods && Mods->getImm();
    2549             : }
    2550             : 
    2551         241 : bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
    2552         473 :   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
    2553         461 :          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
    2554         440 :          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
    2555         661 :          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
    2556         209 :          hasModifiersSet(MI, AMDGPU::OpName::omod);
    2557             : }
    2558             : 
    2559       88061 : bool SIInstrInfo::canShrink(const MachineInstr &MI,
    2560             :                             const MachineRegisterInfo &MRI) const {
    2561             :   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
    2562             :   // Can't shrink instruction with three operands.
    2563             :   // Can't shrink an instruction with three operands.
    2564             :   // a special case for it.  It can only be shrunk if the third operand
    2565             :   // is vcc.  We should handle this the same way we handle vopc, by addding
    2566             :   // is vcc.  We should handle this the same way we handle vopc, by adding
    2567             :   // a register allocation hint pre-regalloc and then doing the shrinking
    2568       88061 :   if (Src2) {
    2569       50100 :     switch (MI.getOpcode()) {
    2570             :       default: return false;
    2571             : 
    2572             :       case AMDGPU::V_ADDC_U32_e64:
    2573             :       case AMDGPU::V_SUBB_U32_e64:
    2574             :       case AMDGPU::V_SUBBREV_U32_e64: {
    2575             :         const MachineOperand *Src1
    2576             :           = getNamedOperand(MI, AMDGPU::OpName::src1);
    2577       14517 :         if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
    2578        3679 :           return false;
    2579             :         // Additional verification is needed for sdst/src2.
    2580             :         return true;
    2581             :       }
    2582             :       case AMDGPU::V_MAC_F32_e64:
    2583             :       case AMDGPU::V_MAC_F16_e64:
    2584             :       case AMDGPU::V_FMAC_F32_e64:
    2585        3272 :         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
    2586        1636 :             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
    2587           0 :           return false;
    2588             :         break;
    2589             : 
    2590             :       case AMDGPU::V_CNDMASK_B32_e64:
    2591             :         break;
    2592             :     }
    2593             :   }
    2594             : 
    2595             :   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
    2596      121738 :   if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
    2597       48194 :                hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    2598       20647 :     return false;
    2599             : 
    2600             :   // We don't need to check src0, all input types are legal, so just make sure
    2601             :   // src0 isn't using any modifiers.
    2602       52897 :   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    2603             :     return false;
    2604             : 
    2605             :   // Check output modifiers
    2606      103864 :   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
    2607       51914 :          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
    2608             : }
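                     : // Example (editor's note): a V_ADD_F32_e64 whose src1 is a VGPR and whose
                     : // modifiers are all clear can shrink to V_ADD_F32_e32; with an SGPR or a
                     : // modifier on src1 it cannot, since the e32 encoding requires a VGPR there
                     : // and has no modifier bits.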
    2609             : 
    2610             : // Set VCC operand with all flags from \p Orig, except for setting it as
    2611             : // implicit.
    2612        7406 : static void copyFlagsToImplicitVCC(MachineInstr &MI,
    2613             :                                    const MachineOperand &Orig) {
    2614             : 
    2615       12554 :   for (MachineOperand &Use : MI.implicit_operands()) {
    2616       12554 :     if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
    2617             :       Use.setIsUndef(Orig.isUndef());
    2618             :       Use.setIsKill(Orig.isKill());
    2619        7406 :       return;
    2620             :     }
    2621             :   }
    2622             : }
    2623             : 
    2624       42959 : MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
    2625             :                                            unsigned Op32) const {
    2626       42959 :   MachineBasicBlock *MBB = MI.getParent();
    2627             :   MachineInstrBuilder Inst32 =
    2628       85918 :     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
    2629             : 
    2630             :   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
    2631             :   // For VOPC instructions, this is replaced by an implicit def of vcc.
    2632       42959 :   int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
    2633       42959 :   if (Op32DstIdx != -1) {
    2634             :     // dst
    2635       40730 :     Inst32.add(MI.getOperand(0));
    2636             :   } else {
    2637             :     assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
    2638             :            "Unexpected case");
    2639             :   }
    2640             : 
    2641       42959 :   Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
    2642             : 
    2643       42959 :   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
    2644       42959 :   if (Src1)
    2645             :     Inst32.add(*Src1);
    2646             : 
    2647       42959 :   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
    2648             : 
    2649       42959 :   if (Src2) {
    2650        8629 :     int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
    2651        8629 :     if (Op32Src2Idx != -1) {
    2652             :       Inst32.add(*Src2);
    2653             :     } else {
    2654             :       // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
    2655             :       // replaced with an implicit read of vcc. This was already added
    2656             :       // during the initial BuildMI, so find it to preserve the flags.
    2657        7406 :       copyFlagsToImplicitVCC(*Inst32, *Src2);
    2658             :     }
    2659             :   }
    2660             : 
    2661       42959 :   return Inst32;
    2662             : }
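                     : // Example (editor's note): shrinking V_CNDMASK_B32_e64 drops the explicit
                     : // condition operand (src2); the e32 form reads the condition from the
                     : // implicit VCC use added by BuildMI, which is why copyFlagsToImplicitVCC()
                     : // is called above.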
    2663             : 
    2664     8785463 : bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
    2665             :                                   const MachineOperand &MO,
    2666             :                                   const MCOperandInfo &OpInfo) const {
    2667             :   // Literal constants use the constant bus.
    2668             :   //if (isLiteralConstantLike(MO, OpInfo))
    2669             :   // return true;
    2670     8785463 :   if (MO.isImm())
    2671     2234167 :     return !isInlineConstant(MO, OpInfo);
    2672             : 
    2673     6551296 :   if (!MO.isReg())
    2674             :     return true; // Misc other operands like FrameIndex
    2675             : 
    2676     6541148 :   if (!MO.isUse())
    2677             :     return false;
    2678             : 
    2679    12703584 :   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    2680     3182316 :     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
    2681             : 
    2682             :   // FLAT_SCR is just an SGPR pair.
    2683     3169476 :   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    2684             :     return true;
    2685             : 
    2686             :   // EXEC register uses the constant bus.
    2687     3169476 :   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    2688             :     return true;
    2689             : 
    2690             :   // SGPRs use the constant bus
    2691     3169476 :   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
    2692      788109 :           (!MO.isImplicit() &&
    2693     5012663 :            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
    2694     2166148 :             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
    2695             : }
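                     : // Summary (editor's note): the operands that occupy the single constant bus
                     : // are literal (non-inline) immediates, SGPRs, VCC, M0, and explicit EXEC or
                     : // FLAT_SCR reads; VGPRs and inline constants do not, which is what the VOP*
                     : // verifier below counts.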
    2696             : 
    2697     4239473 : static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
    2698     9203088 :   for (const MachineOperand &MO : MI.implicit_operands()) {
    2699             :     // We only care about reads.
    2700     5029705 :     if (MO.isDef())
    2701             :       continue;
    2702             : 
    2703     4735708 :     switch (MO.getReg()) {
    2704             :     case AMDGPU::VCC:
    2705             :     case AMDGPU::M0:
    2706             :     case AMDGPU::FLAT_SCR:
    2707             :       return MO.getReg();
    2708             : 
    2709             :     default:
    2710             :       break;
    2711             :     }
    2712             :   }
    2713             : 
    2714             :   return AMDGPU::NoRegister;
    2715             : }
    2716             : 
    2717    11521906 : static bool shouldReadExec(const MachineInstr &MI) {
    2718    11521906 :   if (SIInstrInfo::isVALU(MI)) {
    2719     4223992 :     switch (MI.getOpcode()) {
    2720             :     case AMDGPU::V_READLANE_B32:
    2721             :     case AMDGPU::V_READLANE_B32_si:
    2722             :     case AMDGPU::V_READLANE_B32_vi:
    2723             :     case AMDGPU::V_WRITELANE_B32:
    2724             :     case AMDGPU::V_WRITELANE_B32_si:
    2725             :     case AMDGPU::V_WRITELANE_B32_vi:
    2726             :       return false;
    2727             :     }
    2728             : 
    2729     4191418 :     return true;
    2730             :   }
    2731             : 
    2732     7297914 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
    2733    10775007 :       SIInstrInfo::isSALU(MI) ||
    2734             :       SIInstrInfo::isSMRD(MI))
    2735     5031784 :     return false;
    2736             : 
    2737             :   return true;
    2738             : }
    2739             : 
    2740        2970 : static bool isSubRegOf(const SIRegisterInfo &TRI,
    2741             :                        const MachineOperand &SuperVec,
    2742             :                        const MachineOperand &SubReg) {
    2743        5940 :   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    2744        1906 :     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
    2745             : 
    2746        1064 :   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
    2747        1064 :          SubReg.getReg() == SuperVec.getReg();
    2748             : }
    2749             : 
    2750    17022433 : bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
    2751             :                                     StringRef &ErrInfo) const {
    2752    17022433 :   uint16_t Opcode = MI.getOpcode();
    2753    17022433 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    2754             :     return true;
    2755             : 
    2756    11521906 :   const MachineFunction *MF = MI.getParent()->getParent();
    2757    11521906 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2758             : 
    2759    11521906 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
    2760    11521906 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
    2761    11521906 :   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
    2762             : 
    2763             :   // Make sure the number of operands is correct.
    2764    11521906 :   const MCInstrDesc &Desc = get(Opcode);
    2765    34515469 :   if (!Desc.isVariadic() &&
    2766    22943314 :       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    2767           0 :     ErrInfo = "Instruction has wrong number of operands.";
    2768           0 :     return false;
    2769             :   }
    2770             : 
    2771    11521906 :   if (MI.isInlineAsm()) {
    2772             :     // Verify register classes for inlineasm constraints.
    2773           0 :     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
    2774           0 :          I != E; ++I) {
    2775           0 :       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
    2776           0 :       if (!RC)
    2777             :         continue;
    2778             : 
    2779           0 :       const MachineOperand &Op = MI.getOperand(I);
    2780           0 :       if (!Op.isReg())
    2781             :         continue;
    2782             : 
    2783           0 :       unsigned Reg = Op.getReg();
    2784           0 :       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
    2785           0 :         ErrInfo = "inlineasm operand has incorrect register class.";
    2786           0 :         return false;
    2787             :       }
    2788             :     }
    2789             : 
    2790             :     return true;
    2791             :   }
    2792             : 
    2793             :   // Make sure the register classes are correct.
    2794    52252741 :   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    2795    81461670 :     if (MI.getOperand(i).isFPImm()) {
    2796           0 :       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
    2797             :                 "all fp values to integers.";
    2798           0 :       return false;
    2799             :     }
    2800             : 
    2801    40730835 :     int RegClass = Desc.OpInfo[i].RegClass;
    2802             : 
    2803    40730835 :     switch (Desc.OpInfo[i].OperandType) {
    2804    16380505 :     case MCOI::OPERAND_REGISTER:
    2805    16380505 :       if (MI.getOperand(i).isImm()) {
    2806           0 :         ErrInfo = "Illegal immediate value for operand.";
    2807           0 :         return false;
    2808             :       }
    2809             :       break;
    2810             :     case AMDGPU::OPERAND_REG_IMM_INT32:
    2811             :     case AMDGPU::OPERAND_REG_IMM_FP32:
    2812             :       break;
    2813     5529387 :     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2814             :     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    2815             :     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2816             :     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2817             :     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2818             :     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2819             :       const MachineOperand &MO = MI.getOperand(i);
    2820     7635630 :       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
    2821           0 :         ErrInfo = "Illegal immediate value for operand.";
    2822           0 :         return false;
    2823             :       }
    2824             :       break;
    2825             :     }
    2826    11799723 :     case MCOI::OPERAND_IMMEDIATE:
    2827             :     case AMDGPU::OPERAND_KIMM32:
    2828             :       // Check if this operand is an immediate.
    2829             :       // FrameIndex operands will be replaced by immediates, so they are
    2830             :       // allowed.
    2831    11799723 :       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
    2832           0 :         ErrInfo = "Expected immediate, but got non-immediate";
    2833           0 :         return false;
    2834             :       }
    2835             :       LLVM_FALLTHROUGH;
    2836             :     default:
    2837             :       continue;
    2838             :     }
    2839             : 
    2840    53920256 :     if (!MI.getOperand(i).isReg())
    2841             :       continue;
    2842             : 
    2843    21968251 :     if (RegClass != -1) {
    2844    21968251 :       unsigned Reg = MI.getOperand(i).getReg();
    2845    21968251 :       if (Reg == AMDGPU::NoRegister ||
    2846             :           TargetRegisterInfo::isVirtualRegister(Reg))
    2847             :         continue;
    2848             : 
    2849    11927407 :       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
    2850    11927407 :       if (!RC->contains(Reg)) {
    2851           0 :         ErrInfo = "Operand has incorrect register class.";
    2852           0 :         return false;
    2853             :       }
    2854             :     }
    2855             :   }
    2856             : 
    2857             :   // Verify SDWA
    2858    11521906 :   if (isSDWA(MI)) {
    2859       58235 :     if (!ST.hasSDWA()) {
    2860           0 :       ErrInfo = "SDWA is not supported on this target";
    2861           0 :       return false;
    2862             :     }
    2863             : 
    2864       58235 :     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
    2865             : 
    2866       58235 :     const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
    2867             : 
    2868      291175 :     for (int OpIdx : OpIndices) {
    2869      232940 :       if (OpIdx == -1)
    2870             :         continue;
    2871      167034 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2872             : 
    2873      167034 :       if (!ST.hasSDWAScalar()) {
    2874             :         // Only VGPRS on VI
    2875             :         // Only VGPRs are allowed as operands on VI
    2876           0 :           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
    2877           0 :           return false;
    2878             :         }
    2879             :       } else {
    2880             :         // No immediates on GFX9
    2881       36349 :         if (!MO.isReg()) {
    2882           0 :           ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
    2883           0 :           return false;
    2884             :         }
    2885             :       }
    2886             :     }
    2887             : 
    2888       58235 :     if (!ST.hasSDWAOmod()) {
    2889             :       // No omod allowed on VI
    2890             :       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2891       45943 :       if (OMod != nullptr &&
    2892       14742 :         (!OMod->isImm() || OMod->getImm() != 0)) {
    2893           0 :         ErrInfo = "OMod not allowed in SDWA instructions on VI";
    2894           0 :         return false;
    2895             :       }
    2896             :     }
    2897             : 
    2898       58235 :     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    2899       58235 :     if (isVOPC(BasicOpcode)) {
    2900          63 :       if (!ST.hasSDWASdst() && DstIdx != -1) {
    2901             :         // Only vcc allowed as dst on VI for VOPC
    2902           0 :         const MachineOperand &Dst = MI.getOperand(DstIdx);
    2903           0 :         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
    2904           0 :           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
    2905           0 :           return false;
    2906             :         }
    2907          63 :       } else if (!ST.hasSDWAOutModsVOPC()) {
    2908             :         // No clamp allowed on GFX9 for VOPC
    2909             :         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2910          50 :         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
    2911           0 :           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
    2912           0 :           return false;
    2913             :         }
    2914             : 
    2915             :         // No omod allowed on GFX9 for VOPC
    2916             :         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2917          50 :         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
    2918           0 :           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
    2919           0 :           return false;
    2920             :         }
    2921             :       }
    2922             :     }
    2923             : 
    2924             :     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    2925       58235 :     if (DstUnused && DstUnused->isImm() &&
    2926       58172 :         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
    2927         236 :       const MachineOperand &Dst = MI.getOperand(DstIdx);
    2928         236 :       if (!Dst.isReg() || !Dst.isTied()) {
    2929           0 :         ErrInfo = "Dst register should have tied register";
    2930           0 :         return false;
    2931             :       }
    2932             : 
    2933             :       const MachineOperand &TiedMO =
    2934         236 :           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
    2935         236 :       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
    2936           0 :         ErrInfo =
    2937             :             "Dst register should be tied to implicit use of preserved register";
    2938           0 :         return false;
    2939         472 :       } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
    2940         144 :                  Dst.getReg() != TiedMO.getReg()) {
    2941           0 :         ErrInfo = "Dst register should use same physical register as preserved";
    2942           0 :         return false;
    2943             :       }
    2944             :     }
    2945             :   }
    2946             : 
    2947             :   // Verify VOP*. Ignore multiple sgpr operands on writelane.
    2948    11521906 :   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
    2949    11521906 :       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
    2950             :     // Only look at the true operands. Only a real operand can use the constant
    2951             :     // bus, and we don't want to check pseudo-operands like the source modifier
    2952             :     // flags.
    2953     4166938 :     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
    2954             : 
    2955             :     unsigned ConstantBusCount = 0;
    2956             :     unsigned LiteralCount = 0;
    2957             : 
    2958     4166938 :     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
    2959             :       ++ConstantBusCount;
    2960             : 
    2961     4166938 :     unsigned SGPRUsed = findImplicitSGPRRead(MI);
    2962     4166938 :     if (SGPRUsed != AMDGPU::NoRegister)
    2963       65712 :       ++ConstantBusCount;
    2964             : 
    2965    12206927 :     for (int OpIdx : OpIndices) {
    2966    11338099 :       if (OpIdx == -1)
    2967             :         break;
    2968     8039989 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2969     8039989 :       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
    2970     1834096 :         if (MO.isReg()) {
    2971     1669597 :           if (MO.getReg() != SGPRUsed)
    2972     1651032 :             ++ConstantBusCount;
    2973             :           SGPRUsed = MO.getReg();
    2974             :         } else {
    2975      164499 :           ++ConstantBusCount;
    2976      164499 :           ++LiteralCount;
    2977             :         }
    2978             :       }
    2979             :     }
    2980     4166938 :     if (ConstantBusCount > 1) {
    2981           0 :       ErrInfo = "VOP* instruction uses the constant bus more than once";
    2982           0 :       return false;
    2983             :     }
    2984             : 
    2985     4166938 :     if (isVOP3(MI) && LiteralCount) {
    2986           0 :       ErrInfo = "VOP3 instruction uses literal";
    2987           0 :       return false;
    2988             :     }
    2989             :   }
    2990             : 
    2991             :   // Verify misc. restrictions on specific instructions.
    2992    23043812 :   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
    2993             :       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    2994       12467 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    2995       12467 :     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    2996       12467 :     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    2997       12467 :     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
    2998       11950 :       if (!compareMachineOp(Src0, Src1) &&
    2999             :           !compareMachineOp(Src0, Src2)) {
    3000           0 :         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
    3001           0 :         return false;
    3002             :       }
    3003             :     }
    3004             :   }
    3005             : 
    3006    11521906 :   if (isSOPK(MI)) {
    3007       13823 :     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    3008       13823 :     if (sopkIsZext(MI)) {
    3009         792 :       if (!isUInt<16>(Imm)) {
    3010           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    3011           0 :         return false;
    3012             :       }
    3013             :     } else {
    3014       13031 :       if (!isInt<16>(Imm)) {
    3015           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    3016           0 :         return false;
    3017             :       }
    3018             :     }
    3019             :   }
    3020             : 
    3021    11519707 :   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
    3022    11519707 :       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
    3023    23040842 :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    3024             :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    3025        2970 :     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    3026             :                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
    3027             : 
    3028        2970 :     const unsigned StaticNumOps = Desc.getNumOperands() +
    3029        2970 :       Desc.getNumImplicitUses();
    3030        2970 :     const unsigned NumImplicitOps = IsDst ? 2 : 1;
    3031             : 
    3032             :     // Allow additional implicit operands. This allows a fixup done by the post
    3033             :     // RA scheduler where the main implicit operand is killed and implicit-defs
    3034             :     // are added for sub-registers that remain live after this instruction.
    3035        2970 :     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
    3036           0 :       ErrInfo = "missing implicit register operands";
    3037           0 :       return false;
    3038             :     }
    3039             : 
    3040             :     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    3041        2970 :     if (IsDst) {
    3042         771 :       if (!Dst->isUse()) {
    3043           0 :         ErrInfo = "v_movreld_b32 vdst should be a use operand";
    3044           0 :         return false;
    3045             :       }
    3046             : 
    3047             :       unsigned UseOpIdx;
    3048         771 :       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
    3049         771 :           UseOpIdx != StaticNumOps + 1) {
    3050           0 :         ErrInfo = "movrel implicit operands should be tied";
    3051           0 :         return false;
    3052             :       }
    3053             :     }
    3054             : 
    3055        2970 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3056             :     const MachineOperand &ImpUse
    3057        2970 :       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    3058        5940 :     if (!ImpUse.isReg() || !ImpUse.isUse() ||
    3059        5169 :         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
    3060           0 :       ErrInfo = "src0 should be subreg of implicit vector use";
    3061           0 :       return false;
    3062             :     }
    3063             :   }
    3064             : 
    3065             :   // Make sure we aren't losing exec uses in the td files. This mostly requires
    3066             :   // being careful when using "let Uses" to add other use registers.
    3067    11521906 :   if (shouldReadExec(MI)) {
    3068     6457548 :     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
    3069           0 :       ErrInfo = "VALU instruction does not implicitly read exec mask";
    3070           0 :       return false;
    3071             :     }
    3072             :   }
    3073             : 
    3074    11521906 :   if (isSMRD(MI)) {
    3075     1210963 :     if (MI.mayStore()) {
    3076             :       // The register offset form of scalar stores may only use m0 as the
    3077             :       // soffset register.
    3078             :       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
    3079         363 :       if (Soff && Soff->getReg() != AMDGPU::M0) {
    3080           0 :         ErrInfo = "scalar stores must use m0 as offset register";
    3081           0 :         return false;
    3082             :       }
    3083             :     }
    3084             :   }
    3085             : 
    3086    11521906 :   if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
    3087             :     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    3088      393535 :     if (Offset->getImm() != 0) {
    3089           0 :       ErrInfo = "subtarget does not support offsets in flat instructions";
    3090           0 :       return false;
    3091             :     }
    3092             :   }
    3093             : 
    3094             :   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
    3095    11521906 :   if (DppCt) {
    3096             :     using namespace AMDGPU::DPP;
    3097             : 
    3098        7256 :     unsigned DC = DppCt->getImm();
    3099        7256 :     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
    3100        7256 :         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
    3101        7256 :         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
    3102        7256 :         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
    3103        7256 :         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
    3104        7256 :         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
    3105           0 :       ErrInfo = "Invalid dpp_ctrl value";
    3106           0 :       return false;
    3107             :     }
    3108             :   }
    3109             : 
    3110             :   return true;
    3111             : }
    3112             : 
    3113       95107 : unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
    3114      190214 :   switch (MI.getOpcode()) {
    3115             :   default: return AMDGPU::INSTRUCTION_LIST_END;
    3116       19533 :   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
    3117       44013 :   case AMDGPU::COPY: return AMDGPU::COPY;
    3118         472 :   case AMDGPU::PHI: return AMDGPU::PHI;
    3119           7 :   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
    3120           4 :   case AMDGPU::WQM: return AMDGPU::WQM;
    3121         266 :   case AMDGPU::WWM: return AMDGPU::WWM;
    3122          25 :   case AMDGPU::S_MOV_B32:
    3123          50 :     return MI.getOperand(1).isReg() ?
    3124             :            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
    3125        1948 :   case AMDGPU::S_ADD_I32:
    3126        1948 :     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
    3127         241 :   case AMDGPU::S_ADDC_U32:
    3128         241 :     return AMDGPU::V_ADDC_U32_e32;
    3129         801 :   case AMDGPU::S_SUB_I32:
    3130         801 :     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
    3131             :     // FIXME: These are not consistently handled, and selected when the carry is
    3132             :     // used.
    3133         121 :   case AMDGPU::S_ADD_U32:
    3134         121 :     return AMDGPU::V_ADD_I32_e32;
    3135           0 :   case AMDGPU::S_SUB_U32:
    3136           0 :     return AMDGPU::V_SUB_I32_e32;
    3137           0 :   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
    3138        1217 :   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
    3139        3413 :   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
    3140        1964 :   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
    3141         237 :   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
    3142          24 :   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
    3143           0 :   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
    3144          29 :   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
    3145           3 :   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
    3146        2687 :   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
    3147         175 :   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
    3148        1308 :   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
    3149         144 :   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
    3150        3402 :   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
    3151         109 :   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
    3152         588 :   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
    3153         985 :   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
    3154        1303 :   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
    3155        1989 :   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
    3156           0 :   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
    3157          15 :   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
    3158          13 :   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
    3159          18 :   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
    3160           0 :   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
    3161           0 :   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
    3162          14 :   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
    3163           4 :   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
    3164           5 :   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
    3165           0 :   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
    3166          17 :   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
    3167          26 :   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
    3168           2 :   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
    3169           3 :   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
    3170           2 :   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
    3171           0 :   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
    3172           1 :   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
    3173           1 :   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
    3174         128 :   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
    3175          34 :   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
    3176         158 :   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
    3177           2 :   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
    3178           0 :   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
    3179          75 :   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
    3180             :   }
    3181             : }
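                     : // Rough usage note: when a scalar (SALU) instruction has to be rewritten to
                     : // run on the VALU, this table supplies the replacement opcode; returning
                     : // INSTRUCTION_LIST_END means there is no direct one-to-one VALU equivalent.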
    3182             : 
    3183     1908046 : const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
    3184             :                                                       unsigned OpNo) const {
    3185     1908046 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    3186     3816092 :   const MCInstrDesc &Desc = get(MI.getOpcode());
    3187     1908046 :   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
    3188     1473048 :       Desc.OpInfo[OpNo].RegClass == -1) {
    3189     1595208 :     unsigned Reg = MI.getOperand(OpNo).getReg();
    3190             : 
    3191      797604 :     if (TargetRegisterInfo::isVirtualRegister(Reg))
    3192      423859 :       return MRI.getRegClass(Reg);
    3193      373745 :     return RI.getPhysRegClass(Reg);
    3194             :   }
    3195             : 
    3196     1110442 :   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
    3197     2220884 :   return RI.getRegClass(RCID);
    3198             : }
    3199             : 
    3200      117447 : bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
    3201      117447 :   switch (MI.getOpcode()) {
    3202       75348 :   case AMDGPU::COPY:
    3203             :   case AMDGPU::REG_SEQUENCE:
    3204             :   case AMDGPU::PHI:
    3205             :   case AMDGPU::INSERT_SUBREG:
    3206       75348 :     return RI.hasVGPRs(getOpRegClass(MI, 0));
    3207       42099 :   default:
    3208       42099 :     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
    3209             :   }
    3210             : }
    3211             : 
    3212       21885 : void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
    3213             :   MachineBasicBlock::iterator I = MI;
    3214       21885 :   MachineBasicBlock *MBB = MI.getParent();
    3215       21885 :   MachineOperand &MO = MI.getOperand(OpIdx);
    3216       21885 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    3217       43770 :   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
    3218       21885 :   const TargetRegisterClass *RC = RI.getRegClass(RCID);
    3219             :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
    3220       21885 :   if (MO.isReg())
    3221             :     Opcode = AMDGPU::COPY;
    3222           0 :   else if (RI.isSGPRClass(RC))
    3223             :     Opcode = AMDGPU::S_MOV_B32;
    3224             : 
    3225       21885 :   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
    3226       21885 :   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    3227             :     VRC = &AMDGPU::VReg_64RegClass;
    3228             :   else
    3229             :     VRC = &AMDGPU::VGPR_32RegClass;
    3230             : 
    3231       21885 :   unsigned Reg = MRI.createVirtualRegister(VRC);
    3232             :   DebugLoc DL = MBB->findDebugLoc(I);
    3233       43770 :   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
    3234       21885 :   MO.ChangeToRegister(Reg, false);
    3235       21885 : }
    3236             : 
    3237       30646 : unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
    3238             :                                          MachineRegisterInfo &MRI,
    3239             :                                          MachineOperand &SuperReg,
    3240             :                                          const TargetRegisterClass *SuperRC,
    3241             :                                          unsigned SubIdx,
    3242             :                                          const TargetRegisterClass *SubRC)
    3243             :                                          const {
    3244       30646 :   MachineBasicBlock *MBB = MI->getParent();
    3245             :   DebugLoc DL = MI->getDebugLoc();
    3246       30646 :   unsigned SubReg = MRI.createVirtualRegister(SubRC);
    3247             : 
    3248       30646 :   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    3249       61292 :     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    3250       30646 :       .addReg(SuperReg.getReg(), 0, SubIdx);
    3251       30646 :     return SubReg;
    3252             :   }
    3253             : 
    3254             :   // Just in case the super register is itself a sub-register, copy it to a new
    3255             :   // value so we don't need to worry about merging its subreg index with the
    3256             :   // SubIdx passed to this function. The register coalescer should be able to
    3257             :   // eliminate this extra copy.
    3258           0 :   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
    3259             : 
    3260           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    3261           0 :     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
    3262             : 
    3263           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    3264           0 :     .addReg(NewSuperReg, 0, SubIdx);
    3265             : 
    3266           0 :   return SubReg;
    3267             : }
    3268             : 
    3269       30640 : MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
    3270             :   MachineBasicBlock::iterator MII,
    3271             :   MachineRegisterInfo &MRI,
    3272             :   MachineOperand &Op,
    3273             :   const TargetRegisterClass *SuperRC,
    3274             :   unsigned SubIdx,
    3275             :   const TargetRegisterClass *SubRC) const {
    3276       30640 :   if (Op.isImm()) {
    3277           0 :     if (SubIdx == AMDGPU::sub0)
    3278           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    3279           0 :     if (SubIdx == AMDGPU::sub1)
    3280           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
    3281             : 
    3282           0 :     llvm_unreachable("Unhandled register index for immediate");
    3283             :   }
    3284             : 
    3285       30640 :   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
    3286             :                                        SubIdx, SubRC);
    3287             :   return MachineOperand::CreateReg(SubReg, false);
    3288             : }
    3289             : 
    3290             : // Change the order of operands from (0, 1, 2) to (0, 2, 1)
    3291        4407 : void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
    3292             :   assert(Inst.getNumExplicitOperands() == 3);
    3293        4407 :   MachineOperand Op1 = Inst.getOperand(1);
    3294        4407 :   Inst.RemoveOperand(1);
    3295        4407 :   Inst.addOperand(Op1);
    3296        4407 : }
    3297             : 
    3298      391929 : bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
    3299             :                                     const MCOperandInfo &OpInfo,
    3300             :                                     const MachineOperand &MO) const {
    3301      391929 :   if (!MO.isReg())
    3302             :     return false;
    3303             : 
    3304      391386 :   unsigned Reg = MO.getReg();
    3305             :   const TargetRegisterClass *RC =
    3306      391386 :     TargetRegisterInfo::isVirtualRegister(Reg) ?
    3307             :     MRI.getRegClass(Reg) :
    3308       11384 :     RI.getPhysRegClass(Reg);
    3309             : 
    3310             :   const SIRegisterInfo *TRI =
    3311      391386 :       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
    3312      391386 :   RC = TRI->getSubRegClass(RC, MO.getSubReg());
    3313             : 
    3314             :   // In order to be legal, the common sub-class must be equal to the
    3315             :   // class of the current operand.  For example:
    3316             :   //
    3317             :   // v_mov_b32 s0 ; Operand defined as vsrc_b32
    3318             :   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
    3319             :   //
    3320             :   // s_sendmsg 0, s0 ; Operand defined as m0reg
    3321             :   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
    3322             : 
    3323      782772 :   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
    3324             : }
    3325             : 
    3326           0 : bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
    3327             :                                      const MCOperandInfo &OpInfo,
    3328             :                                      const MachineOperand &MO) const {
    3329           0 :   if (MO.isReg())
    3330           0 :     return isLegalRegOperand(MRI, OpInfo, MO);
    3331             : 
    3332             :   // Handle non-register types that are treated like immediates.
    3333             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    3334             :   return true;
    3335             : }
    3336             : 
    3337      523716 : bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
    3338             :                                  const MachineOperand *MO) const {
    3339      523716 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    3340      523716 :   const MCInstrDesc &InstDesc = MI.getDesc();
    3341      523716 :   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
    3342             :   const TargetRegisterClass *DefinedRC =
    3343      523716 :       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
    3344      523716 :   if (!MO)
    3345           0 :     MO = &MI.getOperand(OpIdx);
    3346             : 
    3347      523716 :   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    3348             : 
    3349             :     RegSubRegPair SGPRUsed;
    3350      141176 :     if (MO->isReg())
    3351      117120 :       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
    3352             : 
    3353      746675 :     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    3354      655857 :       if (i == OpIdx)
    3355             :         continue;
    3356      529411 :       const MachineOperand &Op = MI.getOperand(i);
    3357      529411 :       if (Op.isReg()) {
    3358      814783 :         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
    3359      379432 :             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
    3360             :           return false;
    3361             :         }
    3362       94060 :       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
    3363             :         return false;
    3364             :       }
    3365             :     }
    3366             :   }
    3367             : 
    3368      473358 :   if (MO->isReg()) {
    3369             :     assert(DefinedRC);
    3370      380919 :     return isLegalRegOperand(MRI, OpInfo, *MO);
    3371             :   }
    3372             : 
    3373             :   // Handle non-register types that are treated like immediates.
    3374             :   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
    3375             : 
    3376       92439 :   if (!DefinedRC) {
    3377             :     // This operand expects an immediate.
    3378             :     return true;
    3379             :   }
    3380             : 
    3381       92439 :   return isImmOperandLegal(MI, OpIdx, *MO);
    3382             : }
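                     : // In the VALU case above, a candidate operand is rejected if, together with
                     : // the instruction's remaining operands, it would require a second constant
                     : // bus read (a different SGPR or a literal) or if the instruction already
                     : // carries a mandatory 32-bit literal (OPERAND_KIMM32).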
    3383             : 
    3384        6062 : void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
    3385             :                                        MachineInstr &MI) const {
    3386        6062 :   unsigned Opc = MI.getOpcode();
    3387        6062 :   const MCInstrDesc &InstrDesc = get(Opc);
    3388             : 
    3389        6062 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    3390        6062 :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    3391             : 
    3392             :   // If there is an implicit SGPR use, such as the VCC use for
    3393             :   // v_addc_u32/v_subb_u32, we need to only have one constant bus use.
    3394             :   //
    3395             :   // Note we do not need to worry about literal constants here. They are
    3396             :   // disabled by the operand type for these instructions because they will
    3397             :   // always violate the one constant bus use rule.
    3398        6062 :   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
    3399        6062 :   if (HasImplicitSGPR) {
    3400         241 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3401         241 :     MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3402             : 
    3403         241 :     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
    3404          52 :       legalizeOpWithMove(MI, Src0Idx);
    3405             :   }
    3406             : 
    3407             :   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
    3408             :   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
    3409             :   // src0/src1 with V_READFIRSTLANE.
    3410        6062 :   if (Opc == AMDGPU::V_WRITELANE_B32) {
    3411           2 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3412           2 :     MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3413             :     const DebugLoc &DL = MI.getDebugLoc();
    3414           2 :     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
    3415           0 :       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3416           0 :       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3417             :           .add(Src0);
    3418           0 :       Src0.ChangeToRegister(Reg, false);
    3419             :     }
    3420           2 :     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
    3421           2 :       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3422             :       const DebugLoc &DL = MI.getDebugLoc();
    3423           4 :       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3424             :           .add(Src1);
    3425           2 :       Src1.ChangeToRegister(Reg, false);
    3426             :     }
    3427           2 :     return;
    3428             :   }
    3429             : 
    3430             :   // VOP2 src0 instructions support all operand types, so we don't need to check
    3431             :   // their legality. If src1 is already legal, we don't need to do anything.
    3432        6060 :   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    3433             :     return;
    3434             : 
    3435             :   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
    3436             :   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
    3437             :   // select is uniform.
    3438        5141 :   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
    3439           1 :       RI.isVGPR(MRI, Src1.getReg())) {
    3440           1 :     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3441             :     const DebugLoc &DL = MI.getDebugLoc();
    3442           2 :     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3443             :         .add(Src1);
    3444           1 :     Src1.ChangeToRegister(Reg, false);
    3445           1 :     return;
    3446             :   }
    3447             : 
    3448             :   // We do not use commuteInstruction here because it is too aggressive and will
    3449             :   // commute if it is possible. We only want to commute here if it improves
    3450             :   // legality. This can be called a fairly large number of times so don't waste
    3451             :   // compile time pointlessly swapping and checking legality again.
    3452       10089 :   if (HasImplicitSGPR || !MI.isCommutable()) {
    3453         189 :     legalizeOpWithMove(MI, Src1Idx);
    3454         189 :     return;
    3455             :   }
    3456             : 
    3457        4950 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3458        4950 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3459             : 
    3460             :   // If src0 can be used as src1, commuting will make the operands legal.
    3461             :   // Otherwise we have to give up and insert a move.
    3462             :   //
    3463             :   // TODO: Other immediate-like operand kinds could be commuted if there was a
    3464             :   // MachineOperand::ChangeTo* for them.
    3465        9900 :   if ((!Src1.isImm() && !Src1.isReg()) ||
    3466        4950 :       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    3467           0 :     legalizeOpWithMove(MI, Src1Idx);
    3468           0 :     return;
    3469             :   }
    3470             : 
    3471             :   int CommutedOpc = commuteOpcode(MI);
    3472        4950 :   if (CommutedOpc == -1) {
    3473           0 :     legalizeOpWithMove(MI, Src1Idx);
    3474           0 :     return;
    3475             :   }
    3476             : 
    3477        4950 :   MI.setDesc(get(CommutedOpc));
    3478             : 
    3479        4950 :   unsigned Src0Reg = Src0.getReg();
    3480             :   unsigned Src0SubReg = Src0.getSubReg();
    3481             :   bool Src0Kill = Src0.isKill();
    3482             : 
    3483        4950 :   if (Src1.isImm())
    3484         543 :     Src0.ChangeToImmediate(Src1.getImm());
    3485        4407 :   else if (Src1.isReg()) {
    3486        4407 :     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    3487             :     Src0.setSubReg(Src1.getSubReg());
    3488             :   } else
    3489           0 :     llvm_unreachable("Should only have register or immediate operands");
    3490             : 
    3491        4950 :   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
    3492             :   Src1.setSubReg(Src0SubReg);
    3493             : }
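                     : // Roughly, the common case above: if src1 of a VOP2 is an SGPR or immediate,
                     : // first try to commute it into src0 (which accepts any operand kind), and
                     : // only fall back to legalizeOpWithMove when commuting cannot help.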
    3494             : 
    3495             : // Legalize VOP3 operands. Because all operand types are supported for any
    3496             : // operand, and since literal constants are not allowed and should never be
    3497             : // seen, we only need to worry about inserting copies if we use multiple SGPR
    3498             : // operands.
    3499       66473 : void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
    3500             :                                        MachineInstr &MI) const {
    3501       66473 :   unsigned Opc = MI.getOpcode();
    3502             : 
    3503             :   int VOP3Idx[3] = {
    3504       66473 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    3505       66473 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    3506       66473 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
    3507      199419 :   };
    3508             : 
    3509             :   // Find the one SGPR operand we are allowed to use.
    3510       66473 :   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
    3511             : 
    3512      213758 :   for (unsigned i = 0; i < 3; ++i) {
    3513      193492 :     int Idx = VOP3Idx[i];
    3514      193492 :     if (Idx == -1)
    3515             :       break;
    3516      147285 :     MachineOperand &MO = MI.getOperand(Idx);
    3517             : 
    3518             :     // We should never see a VOP3 instruction with an illegal immediate operand.
    3519      147285 :     if (!MO.isReg())
    3520             :       continue;
    3521             : 
    3522      135045 :     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
    3523             :       continue; // VGPRs are legal
    3524             : 
    3525       71636 :     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
    3526       49992 :       SGPRReg = MO.getReg();
    3527             :       // We can use one SGPR in each VOP3 instruction.
    3528       49992 :       continue;
    3529             :     }
    3530             : 
    3531             :     // If we make it this far, then the operand is not legal and we must
    3532             :     // legalize it.
    3533       21644 :     legalizeOpWithMove(MI, Idx);
    3534             :   }
    3535       66473 : }
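                     : // The SGPR budget here is one: the first SGPR source (or the implicit SGPR
                     : // found by findUsedSGPR) is kept, and every additional distinct SGPR source
                     : // is copied into a VGPR by legalizeOpWithMove.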
    3536             : 
    3537          57 : unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
    3538             :                                          MachineRegisterInfo &MRI) const {
    3539             :   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
    3540          57 :   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
    3541          57 :   unsigned DstReg = MRI.createVirtualRegister(SRC);
    3542          57 :   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
    3543             : 
    3544          57 :   if (SubRegs == 1) {
    3545          32 :     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3546          32 :             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
    3547          16 :         .addReg(SrcReg);
    3548          16 :     return DstReg;
    3549             :   }
    3550             : 
    3551             :   SmallVector<unsigned, 8> SRegs;
    3552         155 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3553         114 :     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3554         228 :     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3555         228 :             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
    3556         114 :         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    3557         114 :     SRegs.push_back(SGPR);
    3558             :   }
    3559             : 
    3560             :   MachineInstrBuilder MIB =
    3561          41 :       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3562          82 :               get(AMDGPU::REG_SEQUENCE), DstReg);
    3563         155 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3564         228 :     MIB.addReg(SRegs[i]);
    3565         114 :     MIB.addImm(RI.getSubRegFromChannel(i));
    3566             :   }
    3567             :   return DstReg;
    3568             : }
    3569             : 
    3570          47 : void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
    3571             :                                        MachineInstr &MI) const {
    3572             : 
    3573             :   // If the pointer is stored in VGPRs, then we need to move it to
    3574             :   // SGPRs using v_readfirstlane.  This is safe because we only select
    3575             :   // loads with uniform pointers to SMRD instructions, so we know the
    3576             :   // pointer value is uniform.
    3577          47 :   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
    3578          47 :   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    3579          33 :     unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    3580          33 :     SBase->setReg(SGPR);
    3581             :   }
    3582          47 :   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
    3583          47 :   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
    3584          14 :     unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
    3585          14 :     SOff->setReg(SGPR);
    3586             :   }
    3587          47 : }
    3588             : 
    3589       25333 : void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
    3590             :                                          MachineBasicBlock::iterator I,
    3591             :                                          const TargetRegisterClass *DstRC,
    3592             :                                          MachineOperand &Op,
    3593             :                                          MachineRegisterInfo &MRI,
    3594             :                                          const DebugLoc &DL) const {
    3595       25333 :   unsigned OpReg = Op.getReg();
    3596             :   unsigned OpSubReg = Op.getSubReg();
    3597             : 
    3598       25333 :   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
    3599             :       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
    3600             : 
    3601             :   // Check if operand is already the correct register class.
    3602       25333 :   if (DstRC == OpRC)
    3603             :     return;
    3604             : 
    3605       24962 :   unsigned DstReg = MRI.createVirtualRegister(DstRC);
    3606             :   MachineInstr *Copy =
    3607       49924 :       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
    3608             : 
    3609       24962 :   Op.setReg(DstReg);
    3610             :   Op.setSubReg(0);
    3611             : 
    3612       24962 :   MachineInstr *Def = MRI.getVRegDef(OpReg);
    3613       24962 :   if (!Def)
    3614             :     return;
    3615             : 
    3616             :   // Try to eliminate the copy if it is copying an immediate value.
    3617       24962 :   if (Def->isMoveImmediate())
    3618        6554 :     FoldImmediate(*Copy, *Def, OpReg, &MRI);
    3619             : }
    3620             : 
    3621             : // Emit the actual waterfall loop, executing the wrapped instruction for each
    3622             : // unique value of \p Rsrc across all lanes. In the best case we execute 1
    3623             : // iteration, in the worst case we execute 64 (once per lane).
    3624             : static void
    3625           0 : emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
    3626             :                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
    3627             :                           const DebugLoc &DL, MachineOperand &Rsrc) {
    3628           0 :   MachineBasicBlock::iterator I = LoopBB.begin();
    3629             : 
    3630           0 :   unsigned VRsrc = Rsrc.getReg();
    3631             :   unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
    3632             : 
    3633           0 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3634           0 :   unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3635           0 :   unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3636           0 :   unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3637           0 :   unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3638           0 :   unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3639           0 :   unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3640           0 :   unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3641           0 :   unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    3642             : 
    3643             :   // Beginning of the loop, read the next Rsrc variant.
    3644           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
    3645           0 :       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
    3646           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
    3647           0 :       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
    3648           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
    3649           0 :       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
    3650           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
    3651           0 :       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
    3652             : 
    3653           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
    3654           0 :       .addReg(SRsrcSub0)
    3655             :       .addImm(AMDGPU::sub0)
    3656           0 :       .addReg(SRsrcSub1)
    3657             :       .addImm(AMDGPU::sub1)
    3658           0 :       .addReg(SRsrcSub2)
    3659             :       .addImm(AMDGPU::sub2)
    3660           0 :       .addReg(SRsrcSub3)
    3661             :       .addImm(AMDGPU::sub3);
    3662             : 
    3663             :   // Update Rsrc operand to use the SGPR Rsrc.
    3664           0 :   Rsrc.setReg(SRsrc);
    3665             :   Rsrc.setIsKill(true);
    3666             : 
    3667             :   // Identify all lanes with identical Rsrc operands in their VGPRs.
    3668           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
    3669           0 :       .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
    3670           0 :       .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
    3671           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
    3672           0 :       .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
    3673           0 :       .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
    3674           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
    3675           0 :       .addReg(CondReg0)
    3676           0 :       .addReg(CondReg1);
    3677             : 
    3678             :   MRI.setSimpleHint(SaveExec, AndCond);
    3679             : 
    3680             :   // Update EXEC to matching lanes, saving original to SaveExec.
    3681           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
    3682           0 :       .addReg(AndCond, RegState::Kill);
    3683             : 
    3684             :   // The original instruction is here; we insert the terminators after it.
    3685           0 :   I = LoopBB.end();
    3686             : 
    3687             :   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
    3688           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
    3689           0 :       .addReg(AMDGPU::EXEC)
    3690           0 :       .addReg(SaveExec);
    3691           0 :   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
    3692           0 : }
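                     : // Rough shape of the emitted loop (a sketch, not the exact MIR):
                     : //   loop:
                     : //     sN       = V_READFIRSTLANE_B32 of each dword of the VGPR rsrc
                     : //     cond     = (srsrc[0:1] == vrsrc[0:1]) & (srsrc[2:3] == vrsrc[2:3])
                     : //     saveexec = exec; exec &= cond     ; only the matching lanes execute
                     : //     <the wrapped instruction, now reading the SGPR rsrc>
                     : //     exec     = exec ^ saveexec        ; retire the lanes just handled
                     : //     S_CBRANCH_EXECNZ loop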
    3693             : 
    3694             : // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
    3695             : // with SGPRs by iterating over all unique values across all lanes.
    3696          17 : static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
    3697             :                               MachineOperand &Rsrc, MachineDominatorTree *MDT) {
    3698          17 :   MachineBasicBlock &MBB = *MI.getParent();
    3699          17 :   MachineFunction &MF = *MBB.getParent();
    3700          17 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3701             :   MachineBasicBlock::iterator I(&MI);
    3702             :   const DebugLoc &DL = MI.getDebugLoc();
    3703             : 
    3704          17 :   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    3705             : 
    3706             :   // Save the EXEC mask
    3707          51 :   BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
    3708          17 :       .addReg(AMDGPU::EXEC);
    3709             : 
    3710             :   // Killed uses in the instruction we are waterfalling around will be
    3711             :   // incorrect due to the added control-flow.
    3712         152 :   for (auto &MO : MI.uses()) {
    3713         135 :     if (MO.isReg() && MO.isUse()) {
    3714          60 :       MRI.clearKillFlags(MO.getReg());
    3715             :     }
    3716             :   }
    3717             : 
    3718             :   // To insert the loop we need to split the block. Move everything after this
    3719             :   // point to a new block, and insert a new empty block between the two.
    3720          17 :   MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
    3721          17 :   MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
    3722             :   MachineFunction::iterator MBBI(MBB);
    3723             :   ++MBBI;
    3724             : 
    3725             :   MF.insert(MBBI, LoopBB);
    3726             :   MF.insert(MBBI, RemainderBB);
    3727             : 
    3728          17 :   LoopBB->addSuccessor(LoopBB);
    3729          17 :   LoopBB->addSuccessor(RemainderBB);
    3730             : 
    3731             :   // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
    3732          17 :   MachineBasicBlock::iterator J = I++;
    3733          17 :   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
    3734             :   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    3735          17 :   LoopBB->splice(LoopBB->begin(), &MBB, J);
    3736             : 
    3737          17 :   MBB.addSuccessor(LoopBB);
    3738             : 
    3739             :   // Update dominators. We know that MBB immediately dominates LoopBB, that
    3740             :   // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
    3741             :   // dominates all of the successors transferred to it from MBB that MBB used
    3742             :   // to dominate.
    3743          17 :   if (MDT) {
    3744             :     MDT->addNewBlock(LoopBB, &MBB);
    3745             :     MDT->addNewBlock(RemainderBB, LoopBB);
    3746          25 :     for (auto &Succ : RemainderBB->successors()) {
    3747          16 :       if (MDT->dominates(&MBB, Succ)) {
    3748           6 :         MDT->changeImmediateDominator(Succ, RemainderBB);
    3749             :       }
    3750             :     }
    3751             :   }
    3752             : 
    3753          17 :   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
    3754             : 
    3755             :   // Restore the EXEC mask
    3756          17 :   MachineBasicBlock::iterator First = RemainderBB->begin();
    3757          34 :   BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    3758          17 :       .addReg(SaveExec);
    3759          17 : }
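                     : // After the split, control flows MBB -> LoopBB -> RemainderBB, with LoopBB
                     : // also branching back to itself; the original EXEC mask is saved in MBB and
                     : // restored at the top of RemainderBB.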
    3760             : 
    3761             : // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
    3762             : static std::tuple<unsigned, unsigned>
    3763           6 : extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
    3764           6 :   MachineBasicBlock &MBB = *MI.getParent();
    3765           6 :   MachineFunction &MF = *MBB.getParent();
    3766           6 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3767             : 
    3768             :   // Extract the ptr from the resource descriptor.
    3769             :   unsigned RsrcPtr =
    3770           6 :       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
    3771             :                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
    3772             : 
    3773             :   // Create an empty resource descriptor
    3774           6 :   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3775           6 :   unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3776           6 :   unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3777           6 :   unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    3778           6 :   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
    3779             : 
    3780             :   // Zero64 = 0
    3781          12 :   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
    3782             :       .addImm(0);
    3783             : 
    3784             :   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    3785          12 :   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
    3786           6 :       .addImm(RsrcDataFormat & 0xFFFFFFFF);
    3787             : 
    3788             :   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    3789          12 :   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
    3790           6 :       .addImm(RsrcDataFormat >> 32);
    3791             : 
    3792             :   // NewSRsrc = {Zero64, SRsrcFormat}
    3793          12 :   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
    3794           6 :       .addReg(Zero64)
    3795             :       .addImm(AMDGPU::sub0_sub1)
    3796           6 :       .addReg(SRsrcFormatLo)
    3797             :       .addImm(AMDGPU::sub2)
    3798           6 :       .addReg(SRsrcFormatHi)
    3799             :       .addImm(AMDGPU::sub3);
    3800             : 
    3801           6 :   return std::make_tuple(RsrcPtr, NewSRsrc);
    3802             : }
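                     : // The replacement descriptor built above has a zero base address and only the
                     : // default RSRC_DATA_FORMAT bits set, so once the caller folds the extracted
                     : // pointer into VAddr the overall address computation is unchanged.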
    3803             : 
    3804       82030 : void SIInstrInfo::legalizeOperands(MachineInstr &MI,
    3805             :                                    MachineDominatorTree *MDT) const {
    3806       82030 :   MachineFunction &MF = *MI.getParent()->getParent();
    3807       82030 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3808             : 
    3809             :   // Legalize VOP2
    3810       82030 :   if (isVOP2(MI) || isVOPC(MI)) {
    3811        6062 :     legalizeOperandsVOP2(MRI, MI);
    3812        6062 :     return;
    3813             :   }
    3814             : 
    3815             :   // Legalize VOP3
    3816       75968 :   if (isVOP3(MI)) {
    3817       27530 :     legalizeOperandsVOP3(MRI, MI);
    3818       27530 :     return;
    3819             :   }
    3820             : 
    3821             :   // Legalize SMRD
    3822       48438 :   if (isSMRD(MI)) {
    3823          47 :     legalizeOperandsSMRD(MRI, MI);
    3824          47 :     return;
    3825             :   }
    3826             : 
    3827             :   // Legalize REG_SEQUENCE and PHI
    3828             :   // The register class of the operands much be the same type as the register
    3829             :   // The register class of the operands must be the same type as the register
    3830       48391 :   if (MI.getOpcode() == AMDGPU::PHI) {
    3831             :     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    3832        1412 :     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
    3833        1880 :       if (!MI.getOperand(i).isReg() ||
    3834         940 :           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
    3835             :         continue;
    3836             :       const TargetRegisterClass *OpRC =
    3837             :           MRI.getRegClass(MI.getOperand(i).getReg());
    3838         940 :       if (RI.hasVGPRs(OpRC)) {
    3839             :         VRC = OpRC;
    3840             :       } else {
    3841             :         SRC = OpRC;
    3842             :       }
    3843             :     }
    3844             : 
    3845             :     // If any of the operands are VGPR registers, then they all must be
    3846             :     // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
    3847             :     // legalizing them.
    3848         608 :     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
    3849         472 :       if (!VRC) {
    3850             :         assert(SRC);
    3851         136 :         VRC = RI.getEquivalentVGPRClass(SRC);
    3852             :       }
    3853             :       RC = VRC;
    3854             :     } else {
    3855             :       RC = SRC;
    3856             :     }
    3857             : 
    3858             :     // Update all the operands so they have the same type.
    3859        1412 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3860         940 :       MachineOperand &Op = MI.getOperand(I);
    3861         940 :       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3862           0 :         continue;
    3863             : 
    3864             :       // MI is a PHI instruction.
    3865         940 :       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
    3866         940 :       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
    3867             : 
    3868             :       // Avoid creating no-op copies with the same src and dst reg class.  These
    3869             :       // confuse some of the machine passes.
    3870         940 :       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    3871             :     }
    3872             :   }
    3873             : 
    3874             :   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
    3875             :   // VGPR dest type and SGPR sources, insert copies so all operands are
    3876             :   // VGPRs. This seems to help operand folding / the register coalescer.
    3877       96782 :   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    3878       19533 :     MachineBasicBlock *MBB = MI.getParent();
    3879       19533 :     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    3880       19533 :     if (RI.hasVGPRs(DstRC)) {
    3881             :       // Update all the operands so they are VGPR register classes. These may
    3882             :       // not be the same register class because REG_SEQUENCE supports mixing
    3883             :       // subregister index types e.g. sub0_sub1 + sub2 + sub3
    3884       71181 :       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3885       51648 :         MachineOperand &Op = MI.getOperand(I);
    3886       51648 :         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3887             :           continue;
    3888             : 
    3889             :         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
    3890       51648 :         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
    3891       51648 :         if (VRC == OpRC)
    3892             :           continue;
    3893             : 
    3894       24358 :         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
    3895             :         Op.setIsKill();
    3896             :       }
    3897             :     }
    3898             : 
    3899       19533 :     return;
    3900             :   }
    3901             : 
    3902             :   // Legalize INSERT_SUBREG
    3903             :   // src0 must have the same register class as dst
    3904       28858 :   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    3905           7 :     unsigned Dst = MI.getOperand(0).getReg();
    3906           7 :     unsigned Src0 = MI.getOperand(1).getReg();
    3907             :     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    3908             :     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    3909           7 :     if (DstRC != Src0RC) {
    3910           5 :       MachineBasicBlock *MBB = MI.getParent();
    3911             :       MachineOperand &Op = MI.getOperand(1);
    3912           5 :       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    3913             :     }
    3914           7 :     return;
    3915             :   }
    3916             : 
    3917             :   // Legalize SI_INIT_M0
    3918       28851 :   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    3919           2 :     MachineOperand &Src = MI.getOperand(0);
    3920           2 :     if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
    3921           2 :       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    3922           2 :     return;
    3923             :   }
    3924             : 
    3925             :   // Legalize MIMG and MUBUF/MTBUF for shaders.
    3926             :   //
    3927             :   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
    3928             :   // scratch memory access. In both cases, the legalization never involves
    3929             :   // conversion to the addr64 form.
    3930       28849 :   if (isMIMG(MI) ||
    3931       57690 :       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
    3932         488 :        (isMUBUF(MI) || isMTBUF(MI)))) {
    3933           8 :     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    3934           8 :     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
    3935           6 :       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
    3936           6 :       SRsrc->setReg(SGPR);
    3937             :     }
    3938             : 
    3939           8 :     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    3940           8 :     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
    3941           2 :       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
    3942           2 :       SSamp->setReg(SGPR);
    3943             :     }
    3944           8 :     return;
    3945             :   }
    3946             : 
    3947             :   // Legalize MUBUF* instructions.
    3948             :   int RsrcIdx =
    3949       28841 :       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    3950       28841 :   if (RsrcIdx != -1) {
    3951             :     // We have an MUBUF instruction
    3952          23 :     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
    3953          23 :     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
    3954          69 :     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
    3955             :                              RI.getRegClass(RsrcRC))) {
    3956             :       // The operands are legal.
    3957             :       // FIXME: We may need to legalize operands besides srsrc.
    3958             :       return;
    3959             :     }
    3960             : 
    3961             :     // Legalize a VGPR Rsrc.
    3962             :     //
    3963             :     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
    3964             :     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
    3965             :     // a zero-value SRsrc.
    3966             :     //
    3967             :     // If the instruction is _OFFSET (both idxen and offen disabled), and we
    3968             :     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
    3969             :     // above.
    3970             :     //
    3971             :     // Otherwise we are on non-ADDR64 hardware, and/or we have
    3972             :     // idxen/offen/bothen and we fall back to a waterfall loop.
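                     :     // Sketch of the ADDR64 path below (placeholder names):
                     :     //   (RsrcPtr, NewSRsrc) = extractRsrcPtr(Rsrc)  // NewSRsrc keeps a zero base pointer
                     :     //   NewVAddr = RsrcPtr + VAddr                  // built as a lo/hi 32-bit add pair
                     :     //   the instruction then uses NewVAddr and NewSRsrc.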
    3973             : 
    3974          23 :     MachineBasicBlock &MBB = *MI.getParent();
    3975             : 
    3976          23 :     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    3977          23 :     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
    3978             :       // This is already an ADDR64 instruction so we need to add the pointer
    3979             :       // extracted from the resource descriptor to the current value of VAddr.
    3980           4 :       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3981           4 :       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3982           4 :       unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    3983             : 
    3984             :       unsigned RsrcPtr, NewSRsrc;
    3985           4 :       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
    3986             : 
    3987             :       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
    3988             :       DebugLoc DL = MI.getDebugLoc();
    3989           8 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
    3990           4 :           .addReg(RsrcPtr, 0, AMDGPU::sub0)
    3991           4 :           .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
    3992             : 
    3993             :       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
    3994           8 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
    3995           4 :           .addReg(RsrcPtr, 0, AMDGPU::sub1)
    3996           4 :           .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
    3997             : 
    3998             :       // NewVaddr = {NewVaddrHi, NewVaddrLo}
    3999           8 :       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
    4000           4 :           .addReg(NewVAddrLo)
    4001             :           .addImm(AMDGPU::sub0)
    4002           4 :           .addReg(NewVAddrHi)
    4003             :           .addImm(AMDGPU::sub1);
    4004             : 
    4005           4 :       VAddr->setReg(NewVAddr);
    4006           4 :       Rsrc->setReg(NewSRsrc);
    4007          19 :     } else if (!VAddr && ST.hasAddr64()) {
    4008             :       // This instruction is the _OFFSET variant, so we need to convert it to
    4009             :       // ADDR64.
    4010             :       assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
    4011             :              < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
    4012             :              "FIXME: Need to emit flat atomics here");
    4013             : 
    4014             :       unsigned RsrcPtr, NewSRsrc;
    4015           2 :       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
    4016             : 
    4017           2 :       unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4018           2 :       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
    4019           2 :       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    4020           2 :       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
    4021           2 :       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
    4022             : 
    4023             :       // Atomics with return have an additional tied operand and are
    4024             :       // missing some of the special bits.
    4025           2 :       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
    4026             :       MachineInstr *Addr64;
    4027             : 
    4028           2 :       if (!VDataIn) {
    4029             :         // Regular buffer load / store.
    4030             :         MachineInstrBuilder MIB =
    4031           4 :             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    4032             :                 .add(*VData)
    4033           2 :                 .addReg(NewVAddr)
    4034           2 :                 .addReg(NewSRsrc)
    4035             :                 .add(*SOffset)
    4036           2 :                 .add(*Offset);
    4037             : 
    4038             :         // Atomics do not have this operand.
    4039           2 :         if (const MachineOperand *GLC =
    4040           2 :                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
    4041           2 :           MIB.addImm(GLC->getImm());
    4042             :         }
    4043             : 
    4044             :         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
    4045             : 
    4046           2 :         if (const MachineOperand *TFE =
    4047           2 :                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
    4048           2 :           MIB.addImm(TFE->getImm());
    4049             :         }
    4050             : 
    4051             :         MIB.cloneMemRefs(MI);
    4052             :         Addr64 = MIB;
    4053             :       } else {
    4054             :         // Atomics with return.
    4055           0 :         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    4056             :                      .add(*VData)
    4057             :                      .add(*VDataIn)
    4058           0 :                      .addReg(NewVAddr)
    4059           0 :                      .addReg(NewSRsrc)
    4060             :                      .add(*SOffset)
    4061             :                      .add(*Offset)
    4062             :                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
    4063           0 :                      .cloneMemRefs(MI);
    4064             :       }
    4065             : 
    4066           2 :       MI.removeFromParent();
    4067             : 
    4068             :       // NewVaddr = {NewVaddrHi, NewVaddrLo}
    4069           2 :       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
    4070           2 :               NewVAddr)
    4071           2 :           .addReg(RsrcPtr, 0, AMDGPU::sub0)
    4072             :           .addImm(AMDGPU::sub0)
    4073           2 :           .addReg(RsrcPtr, 0, AMDGPU::sub1)
    4074             :           .addImm(AMDGPU::sub1);
    4075             :     } else {
    4076             :       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
    4077             :       // to SGPRs.
    4078          17 :       loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
    4079             :     }
    4080             :   }
    4081             : }
    4082             : 
    4083       38259 : void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
    4084             :                              MachineDominatorTree *MDT) const {
    4085             :   SetVectorType Worklist;
    4086       38259 :   Worklist.insert(&TopInst);
    4087             : 
    4088      133366 :   while (!Worklist.empty()) {
    4089             :     MachineInstr &Inst = *Worklist.pop_back_val();
    4090       95107 :     MachineBasicBlock *MBB = Inst.getParent();
    4091       95107 :     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    4092             : 
    4093       95107 :     unsigned Opcode = Inst.getOpcode();
    4094       95107 :     unsigned NewOpcode = getVALUOp(Inst);
    4095             : 
    4096             :     // Handle some special cases
    4097       95107 :     switch (Opcode) {
    4098             :     default:
    4099             :       break;
    4100        5086 :     case AMDGPU::S_ADD_U64_PSEUDO:
    4101             :     case AMDGPU::S_SUB_U64_PSEUDO:
    4102        5086 :       splitScalar64BitAddSub(Worklist, Inst, MDT);
    4103        5086 :       Inst.eraseFromParent();
    4104        5086 :       continue;
    4105        2749 :     case AMDGPU::S_ADD_I32:
    4106             :     case AMDGPU::S_SUB_I32:
    4107             :       // FIXME: The u32 versions currently selected use the carry.
    4108        2749 :       if (moveScalarAddSub(Worklist, Inst, MDT))
    4109             :         continue;
    4110             : 
    4111             :       // Default handling
    4112             :       break;
    4113          62 :     case AMDGPU::S_AND_B64:
    4114          62 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
    4115          62 :       Inst.eraseFromParent();
    4116          62 :       continue;
    4117             : 
    4118          92 :     case AMDGPU::S_OR_B64:
    4119          92 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
    4120          92 :       Inst.eraseFromParent();
    4121          92 :       continue;
    4122             : 
    4123         108 :     case AMDGPU::S_XOR_B64:
    4124         108 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
    4125         108 :       Inst.eraseFromParent();
    4126         108 :       continue;
    4127             : 
    4128          18 :     case AMDGPU::S_NOT_B64:
    4129          18 :       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
    4130          18 :       Inst.eraseFromParent();
    4131          18 :       continue;
    4132             : 
    4133          26 :     case AMDGPU::S_BCNT1_I32_B64:
    4134          26 :       splitScalar64BitBCNT(Worklist, Inst);
    4135          26 :       Inst.eraseFromParent();
    4136          26 :       continue;
    4137             : 
    4138        1811 :     case AMDGPU::S_BFE_I64:
    4139        1811 :       splitScalar64BitBFE(Worklist, Inst);
    4140        1811 :       Inst.eraseFromParent();
    4141        1811 :       continue;
    4142             : 
    4143        1308 :     case AMDGPU::S_LSHL_B32:
    4144        1308 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4145             :         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
    4146         614 :         swapOperands(Inst);
    4147             :       }
    4148             :       break;
    4149        2687 :     case AMDGPU::S_ASHR_I32:
    4150        2687 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4151             :         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
    4152        1617 :         swapOperands(Inst);
    4153             :       }
    4154             :       break;
    4155        3402 :     case AMDGPU::S_LSHR_B32:
    4156        3402 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4157             :         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
    4158        1996 :         swapOperands(Inst);
    4159             :       }
    4160             :       break;
    4161         144 :     case AMDGPU::S_LSHL_B64:
    4162         144 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4163             :         NewOpcode = AMDGPU::V_LSHLREV_B64;
    4164          68 :         swapOperands(Inst);
    4165             :       }
    4166             :       break;
    4167         175 :     case AMDGPU::S_ASHR_I64:
    4168         175 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4169             :         NewOpcode = AMDGPU::V_ASHRREV_I64;
    4170          58 :         swapOperands(Inst);
    4171             :       }
    4172             :       break;
    4173         109 :     case AMDGPU::S_LSHR_B64:
    4174         109 :       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4175             :         NewOpcode = AMDGPU::V_LSHRREV_B64;
    4176          54 :         swapOperands(Inst);
    4177             :       }
    4178             :       break;
    4179             : 
    4180          24 :     case AMDGPU::S_ABS_I32:
    4181          24 :       lowerScalarAbs(Worklist, Inst);
    4182          24 :       Inst.eraseFromParent();
    4183          24 :       continue;
    4184             : 
    4185          75 :     case AMDGPU::S_CBRANCH_SCC0:
    4186             :     case AMDGPU::S_CBRANCH_SCC1:
    4187             :       // Clear unused bits of vcc
    4188          75 :       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
    4189         150 :               AMDGPU::VCC)
    4190          75 :           .addReg(AMDGPU::EXEC)
    4191          75 :           .addReg(AMDGPU::VCC);
    4192          75 :       break;
    4193             : 
    4194             :     case AMDGPU::S_BFE_U64:
    4195             :     case AMDGPU::S_BFM_B64:
    4196             :       llvm_unreachable("Moving this op to VALU not implemented");
    4197             : 
    4198         263 :     case AMDGPU::S_PACK_LL_B32_B16:
    4199             :     case AMDGPU::S_PACK_LH_B32_B16:
    4200             :     case AMDGPU::S_PACK_HH_B32_B16:
    4201         263 :       movePackToVALU(Worklist, MRI, Inst);
    4202         263 :       Inst.eraseFromParent();
    4203         263 :       continue;
    4204             : 
    4205          15 :     case AMDGPU::S_XNOR_B32:
    4206          15 :       lowerScalarXnor(Worklist, Inst);
    4207          15 :       Inst.eraseFromParent();
    4208          15 :       continue;
    4209             : 
    4210           5 :     case AMDGPU::S_XNOR_B64:
    4211           5 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
    4212           5 :       Inst.eraseFromParent();
    4213           5 :       continue;
    4214             :     }
    4215             : 
    4216       82893 :     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
    4217             :       // We cannot move this instruction to the VALU, so we should try to
    4218             :       // legalize its operands instead.
    4219          89 :       legalizeOperands(Inst, MDT);
    4220          89 :       continue;
    4221             :     }
    4222             : 
    4223             :     // Use the new VALU Opcode.
    4224       87211 :     const MCInstrDesc &NewDesc = get(NewOpcode);
    4225             :     Inst.setDesc(NewDesc);
    4226             : 
    4227             :     // Remove any references to SCC. Vector instructions can't read from it, and
    4228             :     // we're just about to add the implicit use / defs of VCC, so we don't want
    4229             :     // both.
    4230      300753 :     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
    4231      213542 :       MachineOperand &Op = Inst.getOperand(i);
    4232      213542 :       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
    4233       20133 :         Inst.RemoveOperand(i);
    4234       20133 :         addSCCDefUsersToVALUWorklist(Inst, Worklist);
    4235             :       }
    4236             :     }
    4237             : 
    4238       87211 :     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
    4239             :       // We are converting these to a BFE, so we need to add the missing
    4240             :       // operands for the size and offset.
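                     :       // e.g. (illustrative) S_SEXT_I32_I8 %dst, %src becomes
                     :       //   V_BFE_I32 %dst, %src, 0, 8   (offset 0, width 8)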
    4241        1573 :       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
    4242        1573 :       Inst.addOperand(MachineOperand::CreateImm(0));
    4243        3146 :       Inst.addOperand(MachineOperand::CreateImm(Size));
    4244             : 
    4245       85638 :     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
    4246             :       // The VALU version adds the second operand to the result, so insert an
    4247             :       // extra 0 operand.
    4248         128 :       Inst.addOperand(MachineOperand::CreateImm(0));
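                     :       // e.g. (illustrative) S_BCNT1_I32_B32 %dst, %src becomes the VALU count
                     :       // with a zero second source, roughly V_BCNT_U32_B32 %dst, %src, 0.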
    4249             :     }
    4250             : 
    4251       87211 :     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    4252             : 
    4253       87211 :     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
    4254        3292 :       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
    4255             :       // If we need to move this to VGPRs, we need to unpack the second operand
    4256             :       // back into the 2 separate ones for bit offset and width.
    4257             :       assert(OffsetWidthOp.isImm() &&
    4258             :              "Scalar BFE is only implemented for constant width and offset");
    4259        3292 :       uint32_t Imm = OffsetWidthOp.getImm();
    4260             : 
    4261        3292 :       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    4262        3292 :       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
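                     :       // e.g. (illustrative) Imm = 0x00080006 unpacks to Offset = 6, BitWidth = 8.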
    4263        3292 :       Inst.RemoveOperand(2);                     // Remove old immediate.
    4264        6584 :       Inst.addOperand(MachineOperand::CreateImm(Offset));
    4265        6584 :       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    4266             :     }
    4267             : 
    4268      174422 :     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    4269             :     unsigned NewDstReg = AMDGPU::NoRegister;
    4270             :     if (HasDst) {
    4271       87061 :       unsigned DstReg = Inst.getOperand(0).getReg();
    4272       87061 :       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
    4273             :         continue;
    4274             : 
    4275             :       // Update the destination register class.
    4276       87004 :       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
    4277       87004 :       if (!NewDstRC)
    4278             :         continue;
    4279             : 
    4280       43981 :       if (Inst.isCopy() &&
    4281      130609 :           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
    4282       43605 :           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
    4283             :         // Instead of creating a copy where src and dst are the same register
    4284             :         // class, we just replace all uses of dst with src.  These kinds of
    4285             :         // copies interfere with the heuristics MachineSink uses to decide
    4286             :         // whether or not to split a critical edge, since the pass assumes
    4287             :         // that copies will end up as machine instructions and not be
    4288             :         // eliminated.
    4289       16216 :         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
    4290       16216 :         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
    4291       16216 :         MRI.clearKillFlags(Inst.getOperand(1).getReg());
    4292       16216 :         Inst.getOperand(0).setReg(DstReg);
    4293             : 
    4294             :         // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
    4295             :         // these are deleted later, but at -O0 it would leave a suspicious
    4296             :         // looking illegal copy of an undef register.
    4297       32432 :         for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
    4298       16216 :           Inst.RemoveOperand(I);
    4299       16216 :         Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
    4300       16216 :         continue;
    4301             :       }
    4302             : 
    4303       70788 :       NewDstReg = MRI.createVirtualRegister(NewDstRC);
    4304       70788 :       MRI.replaceRegWith(DstReg, NewDstReg);
    4305             :     }
    4306             : 
    4307             :     // Legalize the operands
    4308       70938 :     legalizeOperands(Inst, MDT);
    4309             : 
    4310       70938 :     if (HasDst)
    4311       70788 :      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    4312             :   }
    4313       38259 : }
    4314             : 
    4315             : // Add/sub require special handling to deal with carry outs.
    4316        2749 : bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
    4317             :                                    MachineDominatorTree *MDT) const {
    4318        2749 :   if (ST.hasAddNoCarry()) {
    4319             :     // Assume there is no user of scc since we don't select this in that case.
    4320             :     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    4321             :     // is used.
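                     :     // e.g. (illustrative) S_ADD_I32 %d, %a, %b, implicit-def dead $scc becomes
                     :     // V_ADD_U32_e64 %d, %a, %b writing a fresh VGPR result.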
    4322             : 
    4323         297 :     MachineBasicBlock &MBB = *Inst.getParent();
    4324         297 :     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4325             : 
    4326         297 :     unsigned OldDstReg = Inst.getOperand(0).getReg();
    4327         297 :     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4328             : 
    4329         297 :     unsigned Opc = Inst.getOpcode();
    4330             :     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
    4331             : 
    4332         297 :     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
    4333             :       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
    4334             : 
    4335             :     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    4336         297 :     Inst.RemoveOperand(3);
    4337             : 
    4338         297 :     Inst.setDesc(get(NewOpc));
    4339         297 :     Inst.addImplicitDefUseOperands(*MBB.getParent());
    4340         297 :     MRI.replaceRegWith(OldDstReg, ResultReg);
    4341         297 :     legalizeOperands(Inst, MDT);
    4342             : 
    4343         297 :     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4344         297 :     return true;
    4345             :   }
    4346             : 
    4347             :   return false;
    4348             : }
    4349             : 
    4350          24 : void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
    4351             :                                  MachineInstr &Inst) const {
    4352          24 :   MachineBasicBlock &MBB = *Inst.getParent();
    4353          24 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4354             :   MachineBasicBlock::iterator MII = Inst;
    4355             :   DebugLoc DL = Inst.getDebugLoc();
    4356             : 
    4357          24 :   MachineOperand &Dest = Inst.getOperand(0);
    4358             :   MachineOperand &Src = Inst.getOperand(1);
    4359          24 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4360          24 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4361             : 
    4362          24 :   unsigned SubOp = ST.hasAddNoCarry() ?
    4363             :     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
    4364             : 
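                     :   // Lower as abs(x) = max(x, 0 - x): TmpReg = 0 - Src, ResultReg = max(Src, TmpReg).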
    4365          48 :   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
    4366             :     .addImm(0)
    4367          24 :     .addReg(Src.getReg());
    4368             : 
    4369          48 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    4370          24 :     .addReg(Src.getReg())
    4371          24 :     .addReg(TmpReg);
    4372             : 
    4373          24 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4374          24 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4375          24 : }
    4376             : 
    4377          15 : void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
    4378             :                                   MachineInstr &Inst) const {
    4379          15 :   MachineBasicBlock &MBB = *Inst.getParent();
    4380          15 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4381             :   MachineBasicBlock::iterator MII = Inst;
    4382             :   const DebugLoc &DL = Inst.getDebugLoc();
    4383             : 
    4384          15 :   MachineOperand &Dest = Inst.getOperand(0);
    4385             :   MachineOperand &Src0 = Inst.getOperand(1);
    4386             :   MachineOperand &Src1 = Inst.getOperand(2);
    4387             : 
    4388          15 :   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    4389          15 :   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
    4390             : 
    4391          15 :   unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
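                     :   // Targets with DL instructions have a native V_XNOR_B32; otherwise expand to
                     :   // V_NOT_B32(V_XOR_B32(Src0, Src1)).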
    4392          15 :   if (ST.hasDLInsts()) {
    4393           6 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
    4394             :       .add(Src0)
    4395             :       .add(Src1);
    4396             :   } else {
    4397          12 :     unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4398          24 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
    4399             :       .add(Src0)
    4400             :       .add(Src1);
    4401             : 
    4402          36 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
    4403          12 :       .addReg(Xor);
    4404             :   }
    4405             : 
    4406          15 :   MRI.replaceRegWith(Dest.getReg(), NewDest);
    4407          15 :   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
    4408          15 : }
    4409             : 
    4410          18 : void SIInstrInfo::splitScalar64BitUnaryOp(
    4411             :     SetVectorType &Worklist, MachineInstr &Inst,
    4412             :     unsigned Opcode) const {
    4413          18 :   MachineBasicBlock &MBB = *Inst.getParent();
    4414          18 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4415             : 
    4416          18 :   MachineOperand &Dest = Inst.getOperand(0);
    4417             :   MachineOperand &Src0 = Inst.getOperand(1);
    4418             :   DebugLoc DL = Inst.getDebugLoc();
    4419             : 
    4420             :   MachineBasicBlock::iterator MII = Inst;
    4421             : 
    4422          18 :   const MCInstrDesc &InstDesc = get(Opcode);
    4423          18 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    4424          18 :     MRI.getRegClass(Src0.getReg()) :
    4425             :     &AMDGPU::SGPR_32RegClass;
    4426             : 
    4427          18 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4428             : 
    4429             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4430          18 :                                                        AMDGPU::sub0, Src0SubRC);
    4431             : 
    4432          18 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    4433          18 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    4434          18 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    4435             : 
    4436          18 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    4437          18 :   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
    4438             : 
    4439             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4440          18 :                                                        AMDGPU::sub1, Src0SubRC);
    4441             : 
    4442          18 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    4443          36 :   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
    4444             : 
    4445          18 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    4446          36 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4447          18 :     .addReg(DestSub0)
    4448             :     .addImm(AMDGPU::sub0)
    4449          18 :     .addReg(DestSub1)
    4450             :     .addImm(AMDGPU::sub1);
    4451             : 
    4452          18 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4453             : 
    4454             :   // We don't need to legalizeOperands here because for a single operand, src0
    4455             :   // will support any kind of input.
    4456             : 
    4457             :   // Move all users of this moved value.
    4458          18 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4459          18 : }
    4460             : 
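                     : // Split a 64-bit scalar add/sub into two 32-bit VALU halves: the low half
                     : // (V_ADD_I32_e64 / V_SUB_I32_e64) defines a carry, the high half
                     : // (V_ADDC_U32_e64 / V_SUBB_U32_e64) consumes it, and a REG_SEQUENCE
                     : // reassembles the 64-bit result.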
    4461        5086 : void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
    4462             :                                          MachineInstr &Inst,
    4463             :                                          MachineDominatorTree *MDT) const {
    4464        5086 :   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    4465             : 
    4466        5086 :   MachineBasicBlock &MBB = *Inst.getParent();
    4467        5086 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4468             : 
    4469        5086 :   unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4470        5086 :   unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4471        5086 :   unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4472             : 
    4473        5086 :   unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    4474        5086 :   unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
    4475             : 
    4476        5086 :   MachineOperand &Dest = Inst.getOperand(0);
    4477             :   MachineOperand &Src0 = Inst.getOperand(1);
    4478             :   MachineOperand &Src1 = Inst.getOperand(2);
    4479             :   const DebugLoc &DL = Inst.getDebugLoc();
    4480             :   MachineBasicBlock::iterator MII = Inst;
    4481             : 
    4482        5086 :   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
    4483        5086 :   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
    4484        5086 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4485        5086 :   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
    4486             : 
    4487             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4488        5086 :                                                        AMDGPU::sub0, Src0SubRC);
    4489             :   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4490        5086 :                                                        AMDGPU::sub0, Src1SubRC);
    4491             : 
    4492             : 
    4493             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4494        5086 :                                                        AMDGPU::sub1, Src0SubRC);
    4495             :   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4496        5086 :                                                        AMDGPU::sub1, Src1SubRC);
    4497             : 
    4498        5086 :   unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
    4499             :   MachineInstr *LoHalf =
    4500       10172 :     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    4501        5086 :     .addReg(CarryReg, RegState::Define)
    4502             :     .add(SrcReg0Sub0)
    4503        5086 :     .add(SrcReg1Sub0);
    4504             : 
    4505        5086 :   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    4506             :   MachineInstr *HiHalf =
    4507       10172 :     BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    4508        5086 :     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    4509             :     .add(SrcReg0Sub1)
    4510             :     .add(SrcReg1Sub1)
    4511        5086 :     .addReg(CarryReg, RegState::Kill);
    4512             : 
    4513       10172 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4514        5086 :     .addReg(DestSub0)
    4515             :     .addImm(AMDGPU::sub0)
    4516        5086 :     .addReg(DestSub1)
    4517             :     .addImm(AMDGPU::sub1);
    4518             : 
    4519        5086 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4520             : 
    4521             :   // Try to legalize the operands in case we need to swap the order to keep it
    4522             :   // valid.
    4523        5086 :   legalizeOperands(*LoHalf, MDT);
    4524        5086 :   legalizeOperands(*HiHalf, MDT);
    4525             : 
    4526             :   // Move all users of this moved value.
    4527        5086 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4528        5086 : }
    4529             : 
    4530         267 : void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
    4531             :                                            MachineInstr &Inst, unsigned Opcode,
    4532             :                                            MachineDominatorTree *MDT) const {
    4533         267 :   MachineBasicBlock &MBB = *Inst.getParent();
    4534         267 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4535             : 
    4536         267 :   MachineOperand &Dest = Inst.getOperand(0);
    4537             :   MachineOperand &Src0 = Inst.getOperand(1);
    4538             :   MachineOperand &Src1 = Inst.getOperand(2);
    4539             :   DebugLoc DL = Inst.getDebugLoc();
    4540             : 
    4541             :   MachineBasicBlock::iterator MII = Inst;
    4542             : 
    4543         267 :   const MCInstrDesc &InstDesc = get(Opcode);
    4544         267 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    4545         267 :     MRI.getRegClass(Src0.getReg()) :
    4546             :     &AMDGPU::SGPR_32RegClass;
    4547             : 
    4548         267 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    4549         267 :   const TargetRegisterClass *Src1RC = Src1.isReg() ?
    4550         267 :     MRI.getRegClass(Src1.getReg()) :
    4551             :     &AMDGPU::SGPR_32RegClass;
    4552             : 
    4553         267 :   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
    4554             : 
    4555             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4556         267 :                                                        AMDGPU::sub0, Src0SubRC);
    4557             :   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4558         267 :                                                        AMDGPU::sub0, Src1SubRC);
    4559             : 
    4560         267 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    4561         267 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    4562         267 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    4563             : 
    4564         267 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    4565         267 :   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    4566             :                               .add(SrcReg0Sub0)
    4567             :                               .add(SrcReg1Sub0);
    4568             : 
    4569             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    4570         267 :                                                        AMDGPU::sub1, Src0SubRC);
    4571             :   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    4572         267 :                                                        AMDGPU::sub1, Src1SubRC);
    4573             : 
    4574         267 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    4575         267 :   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    4576             :                               .add(SrcReg0Sub1)
    4577             :                               .add(SrcReg1Sub1);
    4578             : 
    4579         267 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    4580         534 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    4581         267 :     .addReg(DestSub0)
    4582             :     .addImm(AMDGPU::sub0)
    4583         267 :     .addReg(DestSub1)
    4584             :     .addImm(AMDGPU::sub1);
    4585             : 
    4586         267 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    4587             : 
    4588             :   // Try to legalize the operands in case we need to swap the order to keep it
    4589             :   // valid.
    4590         267 :   legalizeOperands(LoHalf, MDT);
    4591         267 :   legalizeOperands(HiHalf, MDT);
    4592             : 
    4593             :   // Move all users of this moved value.
    4594         267 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    4595         267 : }
    4596             : 
    4597          26 : void SIInstrInfo::splitScalar64BitBCNT(
    4598             :     SetVectorType &Worklist, MachineInstr &Inst) const {
    4599          26 :   MachineBasicBlock &MBB = *Inst.getParent();
    4600          26 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4601             : 
    4602             :   MachineBasicBlock::iterator MII = Inst;
    4603             :   DebugLoc DL = Inst.getDebugLoc();
    4604             : 
    4605          26 :   MachineOperand &Dest = Inst.getOperand(0);
    4606             :   MachineOperand &Src = Inst.getOperand(1);
    4607             : 
    4608          26 :   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
    4609          26 :   const TargetRegisterClass *SrcRC = Src.isReg() ?
    4610          26 :     MRI.getRegClass(Src.getReg()) :
    4611             :     &AMDGPU::SGPR_32RegClass;
    4612             : 
    4613          26 :   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4614          26 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4615             : 
    4616          26 :   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
    4617             : 
    4618             :   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    4619          26 :                                                       AMDGPU::sub0, SrcSubRC);
    4620             :   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    4621          26 :                                                       AMDGPU::sub1, SrcSubRC);
    4622             : 
    4623          26 :   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
    4624             : 
    4625          52 :   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
    4626             : 
    4627          26 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4628             : 
    4629             :   // We don't need to legalize operands here. src0 for either instruction can be
    4630             :   // an SGPR, and the second input is unused or determined here.
    4631          26 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4632          26 : }
    4633             : 
    4634        1811 : void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
    4635             :                                       MachineInstr &Inst) const {
    4636        1811 :   MachineBasicBlock &MBB = *Inst.getParent();
    4637        1811 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4638             :   MachineBasicBlock::iterator MII = Inst;
    4639             :   DebugLoc DL = Inst.getDebugLoc();
    4640             : 
    4641        1811 :   MachineOperand &Dest = Inst.getOperand(0);
    4642        1811 :   uint32_t Imm = Inst.getOperand(2).getImm();
    4643             :   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    4644        1811 :   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
    4645             : 
    4646             :   (void) Offset;
    4647             : 
    4648             :   // Only sext_inreg cases handled.
    4649             :   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
    4650             :          Offset == 0 && "Not implemented");
    4651             : 
    4652        1811 :   if (BitWidth < 32) {
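                     :     // V_BFE_I32 sign-extends within the low 32 bits; the arithmetic shift right
                     :     // by 31 then replicates the sign bit into the high half.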
    4653        1805 :     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4654        1805 :     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4655        1805 :     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4656             : 
    4657        3610 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
    4658        1805 :         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
    4659             :         .addImm(0)
    4660        1805 :         .addImm(BitWidth);
    4661             : 
    4662        3610 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
    4663             :       .addImm(31)
    4664        1805 :       .addReg(MidRegLo);
    4665             : 
    4666        3610 :     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    4667        1805 :       .addReg(MidRegLo)
    4668             :       .addImm(AMDGPU::sub0)
    4669        1805 :       .addReg(MidRegHi)
    4670             :       .addImm(AMDGPU::sub1);
    4671             : 
    4672        1805 :     MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4673        1805 :     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4674             :     return;
    4675             :   }
    4676             : 
    4677             :   MachineOperand &Src = Inst.getOperand(1);
    4678           6 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4679           6 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    4680             : 
    4681          12 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    4682             :     .addImm(31)
    4683           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0);
    4684             : 
    4685          12 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    4686           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0)
    4687             :     .addImm(AMDGPU::sub0)
    4688           6 :     .addReg(TmpReg)
    4689             :     .addImm(AMDGPU::sub1);
    4690             : 
    4691           6 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4692           6 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4693             : }
    4694             : 
    4695       94811 : void SIInstrInfo::addUsersToMoveToVALUWorklist(
    4696             :   unsigned DstReg,
    4697             :   MachineRegisterInfo &MRI,
    4698             :   SetVectorType &Worklist) const {
    4699       94811 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
    4700      212258 :          E = MRI.use_end(); I != E;) {
    4701      117447 :     MachineInstr &UseMI = *I->getParent();
    4702      117447 :     if (!canReadVGPR(UseMI, I.getOperandNo())) {
    4703       56780 :       Worklist.insert(&UseMI);
    4704             : 
    4705             :       do {
    4706             :         ++I;
    4707       56889 :       } while (I != E && I->getParent() == &UseMI);
    4708             :     } else {
    4709             :       ++I;
    4710             :     }
    4711             :   }
    4712       94811 : }
    4713             : 
    4714         263 : void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
    4715             :                                  MachineRegisterInfo &MRI,
    4716             :                                  MachineInstr &Inst) const {
    4717         263 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4718         263 :   MachineBasicBlock *MBB = Inst.getParent();
    4719         263 :   MachineOperand &Src0 = Inst.getOperand(1);
    4720             :   MachineOperand &Src1 = Inst.getOperand(2);
    4721             :   const DebugLoc &DL = Inst.getDebugLoc();
    4722             : 
    4723         526 :   switch (Inst.getOpcode()) {
    4724             :   case AMDGPU::S_PACK_LL_B32_B16: {
    4725         254 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4726         254 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4727             : 
    4728             :     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    4729             :     // 0.
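                     :     // Result = (Src1 << 16) | (Src0 & 0xffff).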
    4730         508 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4731             :       .addImm(0xffff);
    4732             : 
    4733         508 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
    4734         254 :       .addReg(ImmReg, RegState::Kill)
    4735             :       .add(Src0);
    4736             : 
    4737         508 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
    4738             :       .add(Src1)
    4739             :       .addImm(16)
    4740         254 :       .addReg(TmpReg, RegState::Kill);
    4741         254 :     break;
    4742             :   }
    4743             :   case AMDGPU::S_PACK_LH_B32_B16: {
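                     :     // Result = (Src0 & 0xffff) | (Src1 & 0xffff0000), done with a single V_BFI_B32.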
    4744           6 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4745          12 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4746             :       .addImm(0xffff);
    4747          12 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
    4748           6 :       .addReg(ImmReg, RegState::Kill)
    4749             :       .add(Src0)
    4750             :       .add(Src1);
    4751           6 :     break;
    4752             :   }
    4753             :   case AMDGPU::S_PACK_HH_B32_B16: {
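                     :     // Result = (Src1 & 0xffff0000) | (Src0 >> 16).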
    4754           3 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4755           3 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4756           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
    4757             :       .addImm(16)
    4758             :       .add(Src0);
    4759           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4760             :       .addImm(0xffff0000);
    4761           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
    4762             :       .add(Src1)
    4763           3 :       .addReg(ImmReg, RegState::Kill)
    4764           3 :       .addReg(TmpReg, RegState::Kill);
    4765           3 :     break;
    4766             :   }
    4767           0 :   default:
    4768           0 :     llvm_unreachable("unhandled s_pack_* instruction");
    4769             :   }
    4770             : 
    4771         263 :   MachineOperand &Dest = Inst.getOperand(0);
    4772         263 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4773         263 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4774         263 : }
    4775             : 
    4776       20133 : void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    4777             :     MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
    4778             :   // This assumes that all the users of SCC are in the same block
    4779             :   // as the SCC def.
    4780             :   for (MachineInstr &MI :
    4781             :        make_range(MachineBasicBlock::iterator(SCCDefInst),
    4782      582809 :                       SCCDefInst.getParent()->end())) {
    4783             :     // Exit if we find another SCC def.
    4784      576470 :     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
    4785             :       return;
    4786             : 
    4787      562676 :     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
    4788          75 :       Worklist.insert(&MI);
    4789             :   }
    4790             : }
    4791             : 
    4792       87004 : const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    4793             :   const MachineInstr &Inst) const {
    4794       87004 :   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
    4795             : 
    4796      174008 :   switch (Inst.getOpcode()) {
    4797             :   // For target instructions, getOpRegClass just returns the virtual register
    4798             :   // class associated with the operand, so we need to find an equivalent VGPR
    4799             :   // register class in order to move the instruction to the VALU.
    4800       64263 :   case AMDGPU::COPY:
    4801             :   case AMDGPU::PHI:
    4802             :   case AMDGPU::REG_SEQUENCE:
    4803             :   case AMDGPU::INSERT_SUBREG:
    4804             :   case AMDGPU::WQM:
    4805             :   case AMDGPU::WWM:
    4806       64263 :     if (RI.hasVGPRs(NewDstRC))
    4807             :       return nullptr;
    4808             : 
    4809       64263 :     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    4810       64263 :     if (!NewDstRC)
    4811           0 :       return nullptr;
    4812             :     return NewDstRC;
    4813             :   default:
    4814             :     return NewDstRC;
    4815             :   }
    4816             : }
    4817             : 
    4818             : // Find the one SGPR operand we are allowed to use.
    4819       66473 : unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
    4820             :                                    int OpIndices[3]) const {
    4821       66473 :   const MCInstrDesc &Desc = MI.getDesc();
    4822             : 
    4823             :   // Find the one SGPR operand we are allowed to use.
    4824             :   //
    4825             :   // First we need to consider the instruction's operand requirements before
    4826             :   // legalizing. Some operands are required to be SGPRs, such as implicit uses
    4827             :   // of VCC, but we are still bound by the constant bus requirement to only use
    4828             :   // one.
    4829             :   //
    4830             :   // If the operand's class is an SGPR, we can never move it.
    4831             : 
    4832       66473 :   unsigned SGPRReg = findImplicitSGPRRead(MI);
    4833       66473 :   if (SGPRReg != AMDGPU::NoRegister)
    4834             :     return SGPRReg;
    4835             : 
    4836       66336 :   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
    4837       66336 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    4838             : 
    4839      203415 :   for (unsigned i = 0; i < 3; ++i) {
    4840      193081 :     int Idx = OpIndices[i];
    4841      193081 :     if (Idx == -1)
    4842             :       break;
    4843             : 
    4844      146874 :     const MachineOperand &MO = MI.getOperand(Idx);
    4845      146874 :     if (!MO.isReg())
    4846             :       continue;
    4847             : 
    4848             :     // Is this operand statically required to be an SGPR based on the operand
    4849             :     // constraints?
    4850      134634 :     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    4851      134634 :     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    4852      134634 :     if (IsRequiredSGPR)
    4853        9795 :       return MO.getReg();
    4854             : 
    4855             :     // If this could be a VGPR or an SGPR, check the dynamic register class.
    4856      124839 :     unsigned Reg = MO.getReg();
    4857             :     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    4858      124839 :     if (RI.isSGPRClass(RegRC))
    4859       61817 :       UsedSGPRs[i] = Reg;
    4860             :   }
    4861             : 
    4862             :   // We don't have a required SGPR operand, so we have a bit more freedom in
    4863             :   // selecting operands to move.
    4864             : 
    4865             :   // Try to select the most used SGPR. If an SGPR is equal to one of the
    4866             :   // others, we choose that.
    4867             :   //
    4868             :   // e.g.
    4869             :   // V_FMA_F32 v0, s0, s0, s0 -> No moves
    4870             :   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
    4871             : 
    4872             :   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
    4873             :   // prefer those.
    4874             : 
    4875       56541 :   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    4876       27468 :     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
    4877             :       SGPRReg = UsedSGPRs[0];
    4878             :   }
    4879             : 
    4880       56541 :   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    4881       21375 :     if (UsedSGPRs[1] == UsedSGPRs[2])
    4882             :       SGPRReg = UsedSGPRs[1];
    4883             :   }
    4884             : 
    4885             :   return SGPRReg;
    4886             : }
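// The heuristic above keeps whichever SGPR is used by more than one source
// operand, so that at most one operand has to be copied to a VGPR to satisfy
// the single constant-bus read. A minimal standalone sketch of just that
// selection rule, with 0 standing in for AMDGPU::NoRegister (an assumption
// made purely for illustration):

#include <cstdio>

// Pick the SGPR that appears in more than one of the (up to) three source
// operands; return 0 ("no register") if none repeats.
static unsigned pickMostUsedSGPR(const unsigned Used[3]) {
  unsigned SGPRReg = 0;
  // Prefer Used[0] if it matches another operand.
  if (Used[0] != 0 && (Used[0] == Used[1] || Used[0] == Used[2]))
    SGPRReg = Used[0];
  // Otherwise prefer Used[1] if it matches Used[2].
  if (SGPRReg == 0 && Used[1] != 0 && Used[1] == Used[2])
    SGPRReg = Used[1];
  return SGPRReg;
}

int main() {
  const unsigned A[3] = {5, 7, 5}; // V_FMA_F32 v0, s5, s7, s5 -> keep s5, move only s7
  const unsigned B[3] = {5, 7, 9}; // all distinct -> no preferred SGPR
  std::printf("%u %u\n", pickMostUsedSGPR(A), pickMostUsedSGPR(B)); // prints "5 0"
  return 0;
}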
    4887             : 
    4888    14888322 : MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
    4889             :                                              unsigned OperandName) const {
    4890    14888322 :   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
    4891    14888322 :   if (Idx == -1)
    4892             :     return nullptr;
    4893             : 
    4894     5043442 :   return &MI.getOperand(Idx);
    4895             : }
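// getNamedOperand is how the rest of this file reaches operands by name rather
// than by hard-coded index; it returns nullptr when the opcode has no operand
// with that name. A hedged usage fragment (not standalone: TII and MI are
// assumed to be a SIInstrInfo reference and a MachineInstr in scope):

// Only touch the offset operand if this opcode actually has one.
if (const MachineOperand *Off = TII.getNamedOperand(MI, AMDGPU::OpName::offset)) {
  int64_t ByteOffset = Off->getImm(); // assumes the offset operand is an immediate
  (void)ByteOffset;
}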
    4896             : 
    4897       21399 : uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
    4898             :   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
    4899       42798 :   if (ST.isAmdHsaOS()) {
    4900             :     // Set ATC = 1. GFX9 doesn't have this bit.
    4901         688 :     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    4902             :       RsrcDataFormat |= (1ULL << 56);
    4903             : 
    4904             :     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    4905             :     // Note that this disables TC L2 caching and therefore decreases performance.
    4906         688 :     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
    4907         351 :       RsrcDataFormat |= (2ULL << 59);
    4908             :   }
    4909             : 
    4910       21399 :   return RsrcDataFormat;
    4911             : }
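// Worked example of the bits set above: on an amdhsa VI target both fixups
// apply, so the result is RSRC_DATA_FORMAT | ATC (bit 56) | MTYPE_UC (2 << 59).
// A standalone sketch; kBaseFormat is a placeholder for AMDGPU::RSRC_DATA_FORMAT,
// whose actual value is not shown in this file:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kBaseFormat = 0;  // assumption: stand-in for AMDGPU::RSRC_DATA_FORMAT
  const bool IsAmdHsa = true;      // pretend ST.isAmdHsaOS()
  const bool IsPreGFX9 = true;     // pretend generation <= VOLCANIC_ISLANDS
  const bool IsVI = true;          // pretend generation == VOLCANIC_ISLANDS

  uint64_t Rsrc = kBaseFormat;
  if (IsAmdHsa) {
    if (IsPreGFX9)
      Rsrc |= (1ULL << 56);        // ATC = 1 (GFX9 dropped this bit)
    if (IsVI)
      Rsrc |= (2ULL << 59);        // MTYPE = 2 (MTYPE_UC, uncached)
  }
  std::printf("0x%016llx\n", (unsigned long long)Rsrc); // 0x1100000000000000
  return 0;
}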
    4912             : 
    4913         485 : uint64_t SIInstrInfo::getScratchRsrcWords23() const {
    4914         485 :   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
    4915             :                     AMDGPU::RSRC_TID_ENABLE |
    4916         485 :                     0xffffffff; // Size;
    4917             : 
    4918             :   // GFX9 doesn't have ELEMENT_SIZE.
    4919         485 :   if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    4920         401 :     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    4921         401 :     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
    4922             :   }
    4923             : 
    4924             :   // IndexStride = 64.
    4925         485 :   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
    4926             : 
    4927             :   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
    4928             :   // Clear them unless we want a huge stride.
    4929         485 :   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    4930         255 :     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
    4931             : 
    4932         485 :   return Rsrc23;
    4933             : }
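// The ELEMENT_SIZE field above encodes the maximum private element size as
// log2(size) - 1 (so 4 bytes -> 1, 16 bytes -> 3), and index-stride value 3
// encodes a stride of 64 lanes. A standalone sketch of just that arithmetic;
// the two shift amounts are placeholders, not the real AMDGPU::RSRC_* values:

#include <cstdint>
#include <cstdio>

// Log2 of a power of two (what Log2_32 computes for these inputs).
static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V > 1) { V >>= 1; ++R; }
  return R;
}

int main() {
  const unsigned kElementSizeShift = 51;  // assumption, not AMDGPU::RSRC_ELEMENT_SIZE_SHIFT
  const unsigned kIndexStrideShift = 53;  // assumption, not AMDGPU::RSRC_INDEX_STRIDE_SHIFT

  const unsigned MaxPrivateElementSize = 4;  // e.g. ST.getMaxPrivateElementSize()
  uint64_t Rsrc23 = 0xffffffffULL;           // size field, as in the source above

  const uint64_t EltSizeValue = log2u(MaxPrivateElementSize) - 1;  // 4 bytes -> 1
  Rsrc23 |= EltSizeValue << kElementSizeShift;
  Rsrc23 |= UINT64_C(3) << kIndexStrideShift;                      // stride = 64

  std::printf("0x%016llx\n", (unsigned long long)Rsrc23);
  return 0;
}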
    4934             : 
    4935          60 : bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
    4936          60 :   unsigned Opc = MI.getOpcode();
    4937             : 
    4938          60 :   return isSMRD(Opc);
    4939             : }
    4940             : 
    4941          14 : bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
    4942          14 :   unsigned Opc = MI.getOpcode();
    4943             : 
    4944          14 :   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
    4945             : }
    4946             : 
    4947        2829 : unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
    4948             :                                     int &FrameIndex) const {
    4949             :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    4950        2829 :   if (!Addr || !Addr->isFI())
    4951             :     return AMDGPU::NoRegister;
    4952             : 
    4953             :   assert(!MI.memoperands_empty() &&
    4954             :          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
    4955             : 
    4956        2074 :   FrameIndex = Addr->getIndex();
    4957        2074 :   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
    4958             : }
    4959             : 
    4960          28 : unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
    4961             :                                         int &FrameIndex) const {
    4962             :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
    4963             :   assert(Addr && Addr->isFI());
    4964          28 :   FrameIndex = Addr->getIndex();
    4965          28 :   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
    4966             : }
    4967             : 
    4968       18319 : unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
    4969             :                                           int &FrameIndex) const {
    4970       18319 :   if (!MI.mayLoad())
    4971             :     return AMDGPU::NoRegister;
    4972             : 
    4973        2167 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4974        1425 :     return isStackAccess(MI, FrameIndex);
    4975             : 
    4976         742 :   if (isSGPRSpill(MI))
    4977          27 :     return isSGPRStackAccess(MI, FrameIndex);
    4978             : 
    4979             :   return AMDGPU::NoRegister;
    4980             : }
    4981             : 
    4982        9084 : unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
    4983             :                                          int &FrameIndex) const {
    4984        9084 :   if (!MI.mayStore())
    4985             :     return AMDGPU::NoRegister;
    4986             : 
    4987        1896 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4988        1404 :     return isStackAccess(MI, FrameIndex);
    4989             : 
    4990         492 :   if (isSGPRSpill(MI))
    4991           1 :     return isSGPRStackAccess(MI, FrameIndex);
    4992             : 
    4993             :   return AMDGPU::NoRegister;
    4994             : }
    4995             : 
    4996        1216 : unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
    4997             :   unsigned Size = 0;
    4998        1216 :   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
    4999        1216 :   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
    5000        4864 :   while (++I != E && I->isInsideBundle()) {
    5001             :     assert(!I->isBundle() && "No nested bundle!");
    5002        3648 :     Size += getInstSizeInBytes(*I);
    5003             :   }
    5004             : 
    5005        1216 :   return Size;
    5006             : }
    5007             : 
    5008      732495 : unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
    5009      732495 :   unsigned Opc = MI.getOpcode();
    5010             :   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
    5011      732495 :   unsigned DescSize = Desc.getSize();
    5012             : 
    5013             :   // If we have a definitive size, we can use it. Otherwise we need to inspect
    5014             :   // the operands to know the size.
    5015      732495 :   if (isFixedSize(MI))
    5016             :     return DescSize;
    5017             : 
    5018             :   // 4-byte instructions may have a 32-bit literal encoded after them. Check
    5019             :   // operands that could ever be literals.
    5020      728896 :   if (isVALU(MI) || isSALU(MI)) {
    5021      547189 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    5022      547189 :     if (Src0Idx == -1)
    5023             :       return DescSize; // No operands.
    5024             : 
    5025      838802 :     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
    5026       31104 :       return DescSize + 4;
    5027             : 
    5028      388297 :     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    5029      388297 :     if (Src1Idx == -1)
    5030             :       return DescSize;
    5031             : 
    5032      384986 :     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
    5033       13375 :       return DescSize + 4;
    5034             : 
    5035      179118 :     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    5036      179118 :     if (Src2Idx == -1)
    5037             :       return DescSize;
    5038             : 
    5039       57106 :     if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
    5040           0 :       return DescSize + 4;
    5041             : 
    5042             :     return DescSize;
    5043             :   }
    5044             : 
    5045             :   switch (Opc) {
    5046             :   case TargetOpcode::IMPLICIT_DEF:
    5047             :   case TargetOpcode::KILL:
    5048             :   case TargetOpcode::DBG_VALUE:
    5049             :   case TargetOpcode::EH_LABEL:
    5050             :     return 0;
    5051        1216 :   case TargetOpcode::BUNDLE:
    5052        1216 :     return getInstBundleSize(MI);
    5053        2902 :   case TargetOpcode::INLINEASM: {
    5054        2902 :     const MachineFunction *MF = MI.getParent()->getParent();
    5055        2902 :     const char *AsmStr = MI.getOperand(0).getSymbolName();
    5056        5804 :     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
    5057             :   }
    5058             :   default:
    5059             :     return DescSize;
    5060             :   }
    5061             : }
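// For VALU/SALU the rule above is: the descriptor size, plus 4 bytes if one of
// src0/src1/src2 is a literal that must be encoded after the instruction word
// (inline constants cost nothing extra). A standalone sketch of that rule; the
// inline-constant test here is deliberately simplified to small integers,
// whereas the real check is isLiteralConstantLike against the operand info:

#include <cstdio>

static unsigned sizeWithImmediate(unsigned DescSize, long long Imm) {
  const bool IsInlineConstant = (Imm >= -16 && Imm <= 64); // simplified integer rule
  return IsInlineConstant ? DescSize : DescSize + 4;
}

int main() {
  std::printf("%u\n", sizeWithImmediate(4, 1));          // 4: inline constant
  std::printf("%u\n", sizeWithImmediate(4, 0x12345678)); // 8: needs a 32-bit literal
  return 0;
}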
    5062             : 
    5063           0 : bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
    5064           0 :   if (!isFLAT(MI))
    5065             :     return false;
    5066             : 
    5067           0 :   if (MI.memoperands_empty())
    5068             :     return true;
    5069             : 
    5070           0 :   for (const MachineMemOperand *MMO : MI.memoperands()) {
    5071           0 :     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    5072             :       return true;
    5073             :   }
    5074             :   return false;
    5075             : }
    5076             : 
    5077           0 : bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
    5078           0 :   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
    5079             : }
    5080             : 
    5081           0 : void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
    5082             :                                             MachineBasicBlock *IfEnd) const {
    5083           0 :   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
    5084             :   assert(TI != IfEntry->end());
    5085             : 
    5086             :   MachineInstr *Branch = &(*TI);
    5087           0 :   MachineFunction *MF = IfEntry->getParent();
    5088           0 :   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
    5089             : 
    5090           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    5091           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    5092             :     MachineInstr *SIIF =
    5093           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
    5094           0 :             .add(Branch->getOperand(0))
    5095           0 :             .add(Branch->getOperand(1));
    5096             :     MachineInstr *SIEND =
    5097           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
    5098           0 :             .addReg(DstReg);
    5099             : 
    5100           0 :     IfEntry->erase(TI);
    5101             :     IfEntry->insert(IfEntry->end(), SIIF);
    5102           0 :     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
    5103             :   }
    5104           0 : }
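// The surgery above erases the divergent branch, appends SI_IF to the entry
// block, and inserts SI_END_CF at the start of the merge block. A toy,
// standalone sketch of that reshuffling, using strings as stand-ins for
// MachineInstrs (register and block names are made up for illustration, and
// the "insert before the first non-PHI" detail is ignored):

#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Entry = {"...",
      "SI_NON_UNIFORM_BRCOND_PSEUDO %cond, %bb.endif"};
  std::vector<std::string> EndIf = {"..."};

  Entry.pop_back();                                   // IfEntry->erase(TI)
  Entry.push_back("%mask = SI_IF %cond, %bb.endif");  // append SIIF
  EndIf.insert(EndIf.begin(), "SI_END_CF %mask");     // prepend SIEND

  for (const std::string &I : Entry) std::cout << "entry: " << I << "\n";
  for (const std::string &I : EndIf) std::cout << "endif: " << I << "\n";
  return 0;
}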
    5105             : 
    5106           0 : void SIInstrInfo::convertNonUniformLoopRegion(
    5107             :     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
    5108           0 :   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
    5109             :   // We expect 2 terminators, one conditional and one unconditional.
    5110             :   assert(TI != LoopEnd->end());
    5111             : 
    5112             :   MachineInstr *Branch = &(*TI);
    5113           0 :   MachineFunction *MF = LoopEnd->getParent();
    5114           0 :   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
    5115             : 
    5116           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    5117             : 
    5118           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    5119           0 :     unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    5120             :     MachineInstrBuilder HeaderPHIBuilder =
    5121           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    5122             :     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
    5123             :                                           E = LoopEntry->pred_end();
    5124           0 :          PI != E; ++PI) {
    5125           0 :       if (*PI == LoopEnd) {
    5126           0 :         HeaderPHIBuilder.addReg(BackEdgeReg);
    5127             :       } else {
    5128             :         MachineBasicBlock *PMBB = *PI;
    5129           0 :         unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    5130           0 :         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
    5131             :                              ZeroReg, 0);
    5132           0 :         HeaderPHIBuilder.addReg(ZeroReg);
    5133             :       }
    5134           0 :       HeaderPHIBuilder.addMBB(*PI);
    5135             :     }
    5136           0 :     MachineInstr *HeaderPhi = HeaderPHIBuilder;
    5137           0 :     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
    5138           0 :                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
    5139           0 :                                   .addReg(DstReg)
    5140           0 :                                   .add(Branch->getOperand(0));
    5141             :     MachineInstr *SILOOP =
    5142           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
    5143           0 :             .addReg(BackEdgeReg)
    5144           0 :             .addMBB(LoopEntry);
    5145             : 
    5146             :     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    5147           0 :     LoopEnd->erase(TI);
    5148             :     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    5149             :     LoopEnd->insert(LoopEnd->end(), SILOOP);
    5150             :   }
    5151           0 : }
    5152             : 
    5153             : ArrayRef<std::pair<int, const char *>>
    5154           5 : SIInstrInfo::getSerializableTargetIndices() const {
    5155             :   static const std::pair<int, const char *> TargetIndices[] = {
    5156             :       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
    5157             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
    5158             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
    5159             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
    5160             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
    5161           5 :   return makeArrayRef(TargetIndices);
    5162             : }
    5163             : 
    5164             : /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
    5165             : /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
    5166             : ScheduleHazardRecognizer *
    5167       15844 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
    5168             :                                             const ScheduleDAG *DAG) const {
    5169       15844 :   return new GCNHazardRecognizer(DAG->MF);
    5170             : }
    5171             : 
    5172             : /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
    5173             : /// pass.
    5174             : ScheduleHazardRecognizer *
    5175       19910 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
    5176       19910 :   return new GCNHazardRecognizer(MF);
    5177             : }
    5178             : 
    5179             : std::pair<unsigned, unsigned>
    5180          32 : SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
    5181          32 :   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
    5182             : }
    5183             : 
    5184             : ArrayRef<std::pair<unsigned, const char *>>
    5185          45 : SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
    5186             :   static const std::pair<unsigned, const char *> TargetFlags[] = {
    5187             :     { MO_GOTPCREL, "amdgpu-gotprel" },
    5188             :     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    5189             :     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    5190             :     { MO_REL32_LO, "amdgpu-rel32-lo" },
    5191             :     { MO_REL32_HI, "amdgpu-rel32-hi" }
    5192             :   };
    5193             : 
    5194          45 :   return makeArrayRef(TargetFlags);
    5195             : }
    5196             : 
    5197       17320 : bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
    5198       31356 :   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
    5199       14036 :          MI.modifiesRegister(AMDGPU::EXEC, &RI);
    5200             : }
    5201             : 
    5202             : MachineInstrBuilder
    5203          97 : SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
    5204             :                            MachineBasicBlock::iterator I,
    5205             :                            const DebugLoc &DL,
    5206             :                            unsigned DestReg) const {
    5207          97 :   if (ST.hasAddNoCarry())
    5208          78 :     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
    5209             : 
    5210          58 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    5211          58 :   unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    5212          58 :   MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
    5213             : 
    5214         116 :   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
    5215          58 :            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
    5216             : }
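// Callers treat the two forms above uniformly: the returned builder still
// needs its source operands, and on the V_ADD_I32_e64 path the dead carry-out
// has already been attached. A hedged usage fragment (not standalone: TII,
// MBB, I, DL, DestReg, SrcReg, and Offset are assumed to be in scope):

TII.getAddNoCarry(MBB, I, DL, DestReg)
    .addImm(Offset)   // first addend as an immediate
    .addReg(SrcReg);  // second addend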
    5217             : 
    5218         127 : bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
    5219         127 :   switch (Opcode) {
    5220             :   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
    5221             :   case AMDGPU::SI_KILL_I1_TERMINATOR:
    5222             :     return true;
    5223         122 :   default:
    5224         122 :     return false;
    5225             :   }
    5226             : }
    5227             : 
    5228          84 : const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
    5229          84 :   switch (Opcode) {
    5230          52 :   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    5231         104 :     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
    5232          32 :   case AMDGPU::SI_KILL_I1_PSEUDO:
    5233          64 :     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
    5234           0 :   default:
    5235           0 :     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
    5236             :   }
    5237             : }
    5238             : 
    5239       13962 : bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
    5240       13962 :   if (!isSMRD(MI))
    5241             :     return false;
    5242             : 
    5243             :   // Check that it is using a buffer resource.
    5244       13962 :   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
    5245       13962 :   if (Idx == -1) // e.g. s_memtime
    5246             :     return false;
    5247             : 
    5248       13951 :   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
    5249       13951 :   return RCID == AMDGPU::SReg_128RegClassID;
    5250             : }
    5251             : 
    5252             : // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
    5253             : enum SIEncodingFamily {
    5254             :   SI = 0,
    5255             :   VI = 1,
    5256             :   SDWA = 2,
    5257             :   SDWA9 = 3,
    5258             :   GFX80 = 4,
    5259             :   GFX9 = 5
    5260             : };
    5261             : 
    5262             : static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
    5263     1292382 :   switch (ST.getGeneration()) {
    5264             :   default:
    5265             :     break;
    5266             :   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
    5267             :   case AMDGPUSubtarget::SEA_ISLANDS:
    5268             :     return SIEncodingFamily::SI;
    5269      704185 :   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
    5270             :   case AMDGPUSubtarget::GFX9:
    5271             :     return SIEncodingFamily::VI;
    5272             :   }
    5273           0 :   llvm_unreachable("Unknown subtarget generation!");
    5274             : }
    5275             : 
    5276     1292382 : int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
    5277     1292382 :   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
    5278             : 
    5279     2584764 :   if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
    5280       96220 :     ST.getGeneration() >= AMDGPUSubtarget::GFX9)
    5281             :     Gen = SIEncodingFamily::GFX9;
    5282             : 
    5283     1292382 :   if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
    5284        5417 :     Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
    5285             :                                                       : SIEncodingFamily::SDWA;
    5286             :   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
    5287             :   // subtarget has the UnpackedD16VMem feature.
    5288             :   // TODO: remove this when we discard GFX80 encoding.
    5289     1292382 :   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    5290             :     Gen = SIEncodingFamily::GFX80;
    5291             : 
    5292     1292382 :   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
    5293             : 
    5294             :   // -1 means that Opcode is already a native instruction.
    5295     1292382 :   if (MCOp == -1)
    5296             :     return Opcode;
    5297             : 
    5298             :   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
    5299             :   // no encoding in the given subtarget generation.
    5300     1081655 :   if (MCOp == (uint16_t)-1)
    5301       23140 :     return -1;
    5302             : 
    5303             :   return MCOp;
    5304             : }
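// The two sentinels above matter to callers: getMCOpcode returning plain -1
// means Opcode already is a native instruction (so it is returned unchanged),
// while (uint16_t)-1 means the pseudo has no encoding on this subtarget, which
// pseudoToMCOpcode reports as -1. A hedged usage fragment (not standalone:
// TII and Opc are assumed to be in scope):

int MCOp = TII.pseudoToMCOpcode(Opc);
if (MCOp == -1) {
  // No encoding for this opcode on the current subtarget; the caller must
  // reject or rewrite the instruction rather than emit it.
}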

Generated by: LCOV version 1.13