LCOV - code coverage report
File: lib/Target/AMDGPU/SIInstrInfo.cpp
Test: llvm-toolchain.info
Date: 2017-09-14 15:23:50
                 Hit     Total    Coverage
Lines:          1785      2243      79.6 %
Functions:        98       112      87.5 %

          Line data    Source code
       1             : //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief SI Implementation of TargetInstrInfo.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIInstrInfo.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUSubtarget.h"
      18             : #include "GCNHazardRecognizer.h"
      19             : #include "SIDefines.h"
      20             : #include "SIMachineFunctionInfo.h"
      21             : #include "SIRegisterInfo.h"
      22             : #include "Utils/AMDGPUBaseInfo.h"
      23             : #include "llvm/ADT/APInt.h"
      24             : #include "llvm/ADT/ArrayRef.h"
      25             : #include "llvm/ADT/SmallVector.h"
      26             : #include "llvm/ADT/StringRef.h"
      27             : #include "llvm/ADT/iterator_range.h"
      28             : #include "llvm/Analysis/AliasAnalysis.h"
      29             : #include "llvm/Analysis/MemoryLocation.h"
      30             : #include "llvm/CodeGen/MachineBasicBlock.h"
      31             : #include "llvm/CodeGen/MachineFrameInfo.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineInstr.h"
      34             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      35             : #include "llvm/CodeGen/MachineInstrBundle.h"
      36             : #include "llvm/CodeGen/MachineMemOperand.h"
      37             : #include "llvm/CodeGen/MachineOperand.h"
      38             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      39             : #include "llvm/CodeGen/MachineValueType.h"
      40             : #include "llvm/CodeGen/RegisterScavenging.h"
      41             : #include "llvm/CodeGen/ScheduleDAG.h"
      42             : #include "llvm/CodeGen/SelectionDAGNodes.h"
      43             : #include "llvm/IR/DebugLoc.h"
      44             : #include "llvm/IR/DiagnosticInfo.h"
      45             : #include "llvm/IR/Function.h"
      46             : #include "llvm/IR/InlineAsm.h"
      47             : #include "llvm/IR/LLVMContext.h"
      48             : #include "llvm/MC/MCInstrDesc.h"
      49             : #include "llvm/Support/Casting.h"
      50             : #include "llvm/Support/CommandLine.h"
      51             : #include "llvm/Support/Compiler.h"
      52             : #include "llvm/Support/ErrorHandling.h"
      53             : #include "llvm/Support/MathExtras.h"
      54             : #include "llvm/Target/TargetMachine.h"
      55             : #include "llvm/Target/TargetOpcodes.h"
      56             : #include "llvm/Target/TargetRegisterInfo.h"
      57             : #include <cassert>
      58             : #include <cstdint>
      59             : #include <iterator>
      60             : #include <utility>
      61             : 
      62             : using namespace llvm;
      63             : 
      64             : // Must be at least 4 to be able to branch over minimum unconditional branch
      65             : // code. This is only for making it possible to write reasonably small tests for
      66             : // long branches.
      67             : static cl::opt<unsigned>
      68      289224 : BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
      69      289224 :                  cl::desc("Restrict range of branch instructions (DEBUG)"));
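                      : // Illustrative usage (not part of this file): a lit test can shrink the
                      : // usable branch range so the long-branch handling triggers on tiny
                      : // inputs, e.g. something like:
                      : //   llc -march=amdgcn -amdgpu-s-branch-bits=5 < %s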
      70             : 
      71        1796 : SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
      72        1796 :   : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
      73             : 
      74             : //===----------------------------------------------------------------------===//
      75             : // TargetInstrInfo callbacks
      76             : //===----------------------------------------------------------------------===//
      77             : 
      78             : static unsigned getNumOperandsNoGlue(SDNode *Node) {
      79      917352 :   unsigned N = Node->getNumOperands();
      80     2044028 :   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
      81             :     --N;
      82             :   return N;
      83             : }
      84             : 
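                      : // Note (informal): a load MachineSDNode's operand list ends with its chain
                      : // operand, optionally followed by glue, so the chain is simply the last
                      : // operand once the glue has been stripped.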
      85      458676 : static SDValue findChainOperand(SDNode *Load) {
      86      917352 :   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
      87             :   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
      88      458676 :   return LastOp;
      89             : }
      90             : 
      91             : /// \brief Returns true if both nodes have the same value for the given
       92             : ///        operand \p OpName, or if both nodes do not have this operand.
      93      612815 : static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
      94     1225630 :   unsigned Opc0 = N0->getMachineOpcode();
      95     1225630 :   unsigned Opc1 = N1->getMachineOpcode();
      96             : 
      97      612815 :   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
      98      612815 :   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
      99             : 
     100      612815 :   if (Op0Idx == -1 && Op1Idx == -1)
     101             :     return true;
     102             : 
     103             : 
     104      598863 :   if ((Op0Idx == -1 && Op1Idx != -1) ||
     105             :       (Op1Idx == -1 && Op0Idx != -1))
     106             :     return false;
     107             : 
     108             :   // getNamedOperandIdx returns the index for the MachineInstr's operands,
     109             :   // which includes the result as the first operand. We are indexing into the
     110             :   // MachineSDNode's operands, so we need to skip the result operand to get
     111             :   // the real index.
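                      :   // Illustrative layout (assumed, not taken from the instruction
                      :   // definitions): a buffer load MachineInstr might list
                      :   //   vdata (def), vaddr, srsrc, soffset, offset, ...
                      :   // while the matching MachineSDNode lists only the sources, so every
                      :   // named operand index is shifted down by one here.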
     112      598811 :   --Op0Idx;
     113      598811 :   --Op1Idx;
     114             : 
     115     1796433 :   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
     116             : }
     117             : 
     118       26328 : bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
     119             :                                                     AliasAnalysis *AA) const {
     120             :   // TODO: The generic check fails for VALU instructions that should be
     121             :   // rematerializable due to implicit reads of exec. We really want all of the
      122             :   // generic logic for this, except for this one check.
     123       26328 :   switch (MI.getOpcode()) {
     124             :   case AMDGPU::V_MOV_B32_e32:
     125             :   case AMDGPU::V_MOV_B32_e64:
     126             :   case AMDGPU::V_MOV_B64_PSEUDO:
     127             :     return true;
     128       19938 :   default:
     129       19938 :     return false;
     130             :   }
     131             : }
     132             : 
     133      410560 : bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     134             :                                           int64_t &Offset0,
     135             :                                           int64_t &Offset1) const {
     136      410560 :   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
     137             :     return false;
     138             : 
     139      728500 :   unsigned Opc0 = Load0->getMachineOpcode();
     140      728500 :   unsigned Opc1 = Load1->getMachineOpcode();
     141             : 
     142             :   // Make sure both are actually loads.
     143     1821250 :   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     144             :     return false;
     145             : 
     146      648032 :   if (isDS(Opc0) && isDS(Opc1)) {
     147             : 
     148             :     // FIXME: Handle this case:
     149           0 :     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
     150             :       return false;
     151             : 
     152             :     // Check base reg.
     153           0 :     if (Load0->getOperand(1) != Load1->getOperand(1))
     154             :       return false;
     155             : 
     156             :     // Check chain.
     157           0 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     158             :       return false;
     159             : 
     160             :     // Skip read2 / write2 variants for simplicity.
      161             :     // TODO: We should report true if the used offsets are adjacent
      162             :     // (excluding st64 versions).
     163           0 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
     164           0 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
     165             :       return false;
     166             : 
     167           0 :     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
     168           0 :     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
     169           0 :     return true;
     170             :   }
     171             : 
     172      752242 :   if (isSMRD(Opc0) && isSMRD(Opc1)) {
     173             :     // Skip time and cache invalidation instructions.
     174       71828 :     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
     175       35913 :         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
     176             :       return false;
     177             : 
     178             :     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
     179             : 
     180             :     // Check base reg.
     181      139882 :     if (Load0->getOperand(0) != Load1->getOperand(0))
     182             :       return false;
     183             : 
     184             :     const ConstantSDNode *Load0Offset =
     185       96447 :         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
     186             :     const ConstantSDNode *Load1Offset =
     187       96447 :         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
     188             : 
     189       32149 :     if (!Load0Offset || !Load1Offset)
     190             :       return false;
     191             : 
     192             :     // Check chain.
     193       64286 :     if (findChainOperand(Load0) != findChainOperand(Load1))
     194             :       return false;
     195             : 
     196       32143 :     Offset0 = Load0Offset->getZExtValue();
     197       32143 :     Offset1 = Load1Offset->getZExtValue();
     198       32143 :     return true;
     199             :   }
     200             : 
     201             :   // MUBUF and MTBUF can access the same addresses.
     202      932671 :   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
     203             : 
     204             :     // MUBUF and MTBUF have vaddr at different indices.
     205      219733 :     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
     206      811318 :         findChainOperand(Load0) != findChainOperand(Load1) ||
     207      612815 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
     208      195887 :         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
     209             :       return false;
     210             : 
     211      194339 :     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
     212      194339 :     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
     213             : 
     214      194339 :     if (OffIdx0 == -1 || OffIdx1 == -1)
     215             :       return false;
     216             : 
      217             :     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
      218             :     // include the output in the operand list, but SDNodes don't, we need to
      219             :     // subtract one from the index.
     220      194339 :     --OffIdx0;
     221      194339 :     --OffIdx1;
     222             : 
     223      388678 :     SDValue Off0 = Load0->getOperand(OffIdx0);
     224      388678 :     SDValue Off1 = Load1->getOperand(OffIdx1);
     225             : 
     226             :     // The offset might be a FrameIndexSDNode.
     227             :     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
     228             :       return false;
     229             : 
     230      388678 :     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
     231      388678 :     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
     232      194339 :     return true;
     233             :   }
     234             : 
     235             :   return false;
     236             : }
     237             : 
     238             : static bool isStride64(unsigned Opc) {
     239       31943 :   switch (Opc) {
     240             :   case AMDGPU::DS_READ2ST64_B32:
     241             :   case AMDGPU::DS_READ2ST64_B64:
     242             :   case AMDGPU::DS_WRITE2ST64_B32:
     243             :   case AMDGPU::DS_WRITE2ST64_B64:
     244             :     return true;
     245             :   default:
     246             :     return false;
     247             :   }
     248             : }
     249             : 
     250      971949 : bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
     251             :                                         int64_t &Offset,
     252             :                                         const TargetRegisterInfo *TRI) const {
     253     1943898 :   unsigned Opc = LdSt.getOpcode();
     254             : 
     255      971949 :   if (isDS(LdSt)) {
     256       68312 :     const MachineOperand *OffsetImm =
     257             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     258       68312 :     if (OffsetImm) {
     259             :       // Normal, single offset LDS instruction.
     260       28060 :       const MachineOperand *AddrReg =
     261             :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     262             : 
     263       28060 :       BaseReg = AddrReg->getReg();
     264       28060 :       Offset = OffsetImm->getImm();
     265       28060 :       return true;
     266             :     }
     267             : 
     268             :     // The 2 offset instructions use offset0 and offset1 instead. We can treat
     269             :     // these as a load with a single offset if the 2 offsets are consecutive. We
     270             :     // will use this for some partially aligned loads.
     271       40252 :     const MachineOperand *Offset0Imm =
     272             :         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
     273       40252 :     const MachineOperand *Offset1Imm =
     274             :         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
     275             : 
     276       40252 :     uint8_t Offset0 = Offset0Imm->getImm();
     277       40252 :     uint8_t Offset1 = Offset1Imm->getImm();
     278             : 
     279       40252 :     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
     280             :       // Each of these offsets is in element sized units, so we need to convert
     281             :       // to bytes of the individual reads.
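                      :       // Worked example (illustrative): a ds_read2_b32 with offset0 = 2 and
                      :       // offset1 = 3 returns both elements in one 64-bit register, giving
                      :       // EltSize = 64 / 16 = 4 bytes, so the reported Offset is 4 * 2 = 8.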
     282             : 
     283             :       unsigned EltSize;
     284       31943 :       if (LdSt.mayLoad())
     285       15870 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
     286             :       else {
     287             :         assert(LdSt.mayStore());
     288       24008 :         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
     289       48016 :         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
     290             :       }
     291             : 
     292           5 :       if (isStride64(Opc))
     293           5 :         EltSize *= 64;
     294             : 
     295       31943 :       const MachineOperand *AddrReg =
     296             :           getNamedOperand(LdSt, AMDGPU::OpName::addr);
     297       31943 :       BaseReg = AddrReg->getReg();
     298       31943 :       Offset = EltSize * Offset0;
     299       31943 :       return true;
     300             :     }
     301             : 
     302             :     return false;
     303             :   }
     304             : 
     305      998629 :   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
     306      808759 :     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
     307     1617514 :     if (SOffset && SOffset->isReg())
     308             :       return false;
     309             : 
     310       93233 :     const MachineOperand *AddrReg =
     311             :         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     312       93233 :     if (!AddrReg)
     313             :       return false;
     314             : 
     315        4933 :     const MachineOperand *OffsetImm =
     316             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     317        4933 :     BaseReg = AddrReg->getReg();
     318        4933 :     Offset = OffsetImm->getImm();
     319             : 
     320        4933 :     if (SOffset) // soffset can be an inline immediate.
     321        4933 :       Offset += SOffset->getImm();
     322             : 
     323             :     return true;
     324             :   }
     325             : 
     326       94878 :   if (isSMRD(LdSt)) {
     327       29710 :     const MachineOperand *OffsetImm =
     328             :         getNamedOperand(LdSt, AMDGPU::OpName::offset);
     329       29710 :     if (!OffsetImm)
     330             :       return false;
     331             : 
     332       29651 :     const MachineOperand *SBaseReg =
     333             :         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
     334       29651 :     BaseReg = SBaseReg->getReg();
     335       29651 :     Offset = OffsetImm->getImm();
     336       29651 :     return true;
     337             :   }
     338             : 
     339       65168 :   if (isFLAT(LdSt)) {
     340       63656 :     const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     341       63656 :     if (VAddr) {
     342             :       // Can't analyze 2 offsets.
     343       63656 :       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
     344             :         return false;
     345             : 
     346       63656 :       BaseReg = VAddr->getReg();
     347             :     } else {
      348             :       // Scratch instructions have either vaddr or saddr.
     349           0 :       BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
     350             :     }
     351             : 
     352       63656 :     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
     353       63656 :     return true;
     354             :   }
     355             : 
     356             :   return false;
     357             : }
     358             : 
     359       17601 : bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
     360             :                                       MachineInstr &SecondLdSt,
     361             :                                       unsigned NumLoads) const {
     362       17601 :   const MachineOperand *FirstDst = nullptr;
     363       17601 :   const MachineOperand *SecondDst = nullptr;
     364             : 
     365       35202 :   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
     366       52491 :       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
     367       17481 :       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
     368         192 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     369         192 :     if (!FirstDst)
     370          20 :       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     371         192 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
     372         192 :     if (!SecondDst)
     373          20 :       SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     374       33348 :   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
     375       15939 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
     376       15939 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
     377        2940 :   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
     378        1470 :     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
     379        1470 :     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
     380             :   }
     381             : 
     382       17601 :   if (!FirstDst || !SecondDst)
     383             :     return false;
     384             : 
     385             :   // Try to limit clustering based on the total number of bytes loaded
     386             :   // rather than the number of instructions.  This is done to help reduce
     387             :   // register pressure.  The method used is somewhat inexact, though,
     388             :   // because it assumes that all loads in the cluster will load the
     389             :   // same number of bytes as FirstLdSt.
     390             : 
     391             :   // The unit of this value is bytes.
     392             :   // FIXME: This needs finer tuning.
     393       17144 :   unsigned LoadClusterThreshold = 16;
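                      :   // Illustrative effect of the 16-byte threshold: with 32-bit destination
                      :   // registers (4 bytes each) up to four loads may be clustered, while a
                      :   // load defining a 128-bit register already uses the whole budget, so a
                      :   // second such load is never clustered with it.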
     394             : 
     395             :   const MachineRegisterInfo &MRI =
     396       17144 :       FirstLdSt.getParent()->getParent()->getRegInfo();
     397       34288 :   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
     398             : 
     399       34288 :   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
     400             : }
     401             : 
     402          10 : static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
     403             :                               MachineBasicBlock::iterator MI,
     404             :                               const DebugLoc &DL, unsigned DestReg,
     405             :                               unsigned SrcReg, bool KillSrc) {
     406          10 :   MachineFunction *MF = MBB.getParent();
     407          10 :   DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
     408             :                                         "illegal SGPR to VGPR copy",
     409          30 :                                         DL, DS_Error);
     410          10 :   LLVMContext &C = MF->getFunction()->getContext();
     411          10 :   C.diagnose(IllegalCopy);
     412             : 
     413          30 :   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
     414          10 :     .addReg(SrcReg, getKillRegState(KillSrc));
     415          10 : }
     416             : 
     417       39827 : void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     418             :                               MachineBasicBlock::iterator MI,
     419             :                               const DebugLoc &DL, unsigned DestReg,
     420             :                               unsigned SrcReg, bool KillSrc) const {
     421       39827 :   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
     422             : 
     423       39827 :   if (RC == &AMDGPU::VGPR_32RegClass) {
     424             :     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
     425             :            AMDGPU::SReg_32RegClass.contains(SrcReg));
     426       71259 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     427       23753 :       .addReg(SrcReg, getKillRegState(KillSrc));
     428       23753 :     return;
     429             :   }
     430             : 
     431       16074 :   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
     432             :       RC == &AMDGPU::SReg_32RegClass) {
     433       10375 :     if (SrcReg == AMDGPU::SCC) {
     434           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
     435           0 :           .addImm(-1)
     436           0 :           .addImm(0);
     437           0 :       return;
     438             :     }
     439             : 
     440       20750 :     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
     441           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     442           2 :       return;
     443             :     }
     444             : 
     445       31119 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     446       10373 :             .addReg(SrcReg, getKillRegState(KillSrc));
     447       10373 :     return;
     448             :   }
     449             : 
     450        5699 :   if (RC == &AMDGPU::SReg_64RegClass) {
     451        1455 :     if (DestReg == AMDGPU::VCC) {
     452          46 :       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     453          45 :         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
     454          15 :           .addReg(SrcReg, getKillRegState(KillSrc));
     455             :       } else {
     456             :         // FIXME: Hack until VReg_1 removed.
     457             :         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
     458          24 :         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
     459           8 :           .addImm(0)
     460           8 :           .addReg(SrcReg, getKillRegState(KillSrc));
     461             :       }
     462             : 
     463             :       return;
     464             :     }
     465             : 
     466        2862 :     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
     467           2 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     468           2 :       return;
     469             :     }
     470             : 
     471        4290 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     472        1430 :             .addReg(SrcReg, getKillRegState(KillSrc));
     473        1430 :     return;
     474             :   }
     475             : 
     476        4244 :   if (DestReg == AMDGPU::SCC) {
     477             :     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
     478           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
     479           0 :       .addReg(SrcReg, getKillRegState(KillSrc))
     480           0 :       .addImm(0);
     481           0 :     return;
     482             :   }
     483             : 
     484        4244 :   unsigned EltSize = 4;
     485        4244 :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     486        8488 :   if (RI.isSGPRClass(RC)) {
     487         284 :     if (RI.getRegSizeInBits(*RC) > 32) {
     488             :       Opcode =  AMDGPU::S_MOV_B64;
     489             :       EltSize = 8;
     490             :     } else {
     491           0 :       Opcode = AMDGPU::S_MOV_B32;
     492           0 :       EltSize = 4;
     493             :     }
     494             : 
     495         284 :     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
     496           6 :       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
     497           6 :       return;
     498             :     }
     499             :   }
     500             : 
     501        4238 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
     502       12714 :   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
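                      :   // Informal note: when the destination starts at a higher hardware register
                      :   // index than the source, copying sub-registers low-to-high could clobber
                      :   // not-yet-read source lanes if the ranges overlap, so the loop below walks
                      :   // the sub-register indices in reverse in that case.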
     503             : 
     504       13094 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     505             :     unsigned SubIdx;
     506        8856 :     if (Forward)
     507       13976 :       SubIdx = SubIndices[Idx];
     508             :     else
     509        3736 :       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
     510             : 
     511             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     512       17712 :       get(Opcode), RI.getSubReg(DestReg, SubIdx));
     513             : 
     514        8856 :     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
     515             : 
     516        8856 :     if (Idx == 0)
     517        4238 :       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
     518             : 
     519        8856 :     bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
     520        8856 :     Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
     521             :   }
     522             : }
     523             : 
     524      241209 : int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
     525             :   int NewOpc;
     526             : 
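                      :   // Example (illustrative): commuting V_SUB_F32 (dst = src0 - src1) maps to
                      :   // V_SUBREV_F32 (dst = src1 - src0) and vice versa; -1 is returned when the
                      :   // mapped opcode is not available on this subtarget.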
     527             :   // Try to map original to commuted opcode
     528      241209 :   NewOpc = AMDGPU::getCommuteRev(Opcode);
     529      241209 :   if (NewOpc != -1)
     530             :     // Check if the commuted (REV) opcode exists on the target.
     531       22940 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     532             : 
     533             :   // Try to map commuted to original opcode
     534      218269 :   NewOpc = AMDGPU::getCommuteOrig(Opcode);
     535      218269 :   if (NewOpc != -1)
     536             :     // Check if the original (non-REV) opcode exists on the target.
     537       39182 :     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
     538             : 
     539      179087 :   return Opcode;
     540             : }
     541             : 
     542           0 : void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
     543             :                                        MachineBasicBlock::iterator MI,
     544             :                                        const DebugLoc &DL, unsigned DestReg,
     545             :                                        int64_t Value) const {
     546           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     547           0 :   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
     548           0 :   if (RegClass == &AMDGPU::SReg_32RegClass ||
     549           0 :       RegClass == &AMDGPU::SGPR_32RegClass ||
     550           0 :       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
     551             :       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
     552           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
     553           0 :       .addImm(Value);
     554           0 :     return;
     555             :   }
     556             : 
     557           0 :   if (RegClass == &AMDGPU::SReg_64RegClass ||
     558           0 :       RegClass == &AMDGPU::SGPR_64RegClass ||
     559             :       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
     560           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
     561           0 :       .addImm(Value);
     562           0 :     return;
     563             :   }
     564             : 
     565           0 :   if (RegClass == &AMDGPU::VGPR_32RegClass) {
     566           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
     567           0 :       .addImm(Value);
     568           0 :     return;
     569             :   }
     570           0 :   if (RegClass == &AMDGPU::VReg_64RegClass) {
     571           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
     572           0 :       .addImm(Value);
     573           0 :     return;
     574             :   }
     575             : 
     576           0 :   unsigned EltSize = 4;
     577           0 :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
     578           0 :   if (RI.isSGPRClass(RegClass)) {
     579           0 :     if (RI.getRegSizeInBits(*RegClass) > 32) {
     580             :       Opcode =  AMDGPU::S_MOV_B64;
     581             :       EltSize = 8;
     582             :     } else {
     583           0 :       Opcode = AMDGPU::S_MOV_B32;
     584           0 :       EltSize = 4;
     585             :     }
     586             :   }
     587             : 
     588           0 :   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
     589           0 :   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     590           0 :     int64_t IdxValue = Idx == 0 ? Value : 0;
     591             : 
     592             :     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
     593           0 :       get(Opcode), RI.getSubReg(DestReg, Idx));
     594           0 :     Builder.addImm(IdxValue);
     595             :   }
     596             : }
     597             : 
     598             : const TargetRegisterClass *
     599           0 : SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
     600           0 :   return &AMDGPU::VGPR_32RegClass;
     601             : }
     602             : 
     603           0 : void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
     604             :                                      MachineBasicBlock::iterator I,
     605             :                                      const DebugLoc &DL, unsigned DstReg,
     606             :                                      ArrayRef<MachineOperand> Cond,
     607             :                                      unsigned TrueReg,
     608             :                                      unsigned FalseReg) const {
     609           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
     610             :   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
     611             :          "Not a VGPR32 reg");
     612             : 
     613           0 :   if (Cond.size() == 1) {
     614           0 :     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     615           0 :       .addReg(FalseReg)
     616           0 :       .addReg(TrueReg)
     617           0 :       .add(Cond[0]);
     618           0 :   } else if (Cond.size() == 2) {
     619             :     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
     620           0 :     switch (Cond[0].getImm()) {
     621           0 :     case SIInstrInfo::SCC_TRUE: {
     622           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     623           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     624           0 :         .addImm(-1)
     625           0 :         .addImm(0);
     626           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     627           0 :         .addReg(FalseReg)
     628           0 :         .addReg(TrueReg)
     629           0 :         .addReg(SReg);
     630           0 :       break;
     631             :     }
     632           0 :     case SIInstrInfo::SCC_FALSE: {
     633           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     634           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     635           0 :         .addImm(0)
     636           0 :         .addImm(-1);
     637           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     638           0 :         .addReg(FalseReg)
     639           0 :         .addReg(TrueReg)
     640           0 :         .addReg(SReg);
     641           0 :       break;
     642             :     }
     643           0 :     case SIInstrInfo::VCCNZ: {
     644           0 :       MachineOperand RegOp = Cond[1];
     645           0 :       RegOp.setImplicit(false);
     646           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     647           0 :           .addReg(FalseReg)
     648           0 :           .addReg(TrueReg)
     649           0 :           .add(RegOp);
     650             :       break;
     651             :     }
     652           0 :     case SIInstrInfo::VCCZ: {
     653           0 :       MachineOperand RegOp = Cond[1];
     654           0 :       RegOp.setImplicit(false);
     655           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     656           0 :           .addReg(TrueReg)
     657           0 :           .addReg(FalseReg)
     658           0 :           .add(RegOp);
     659             :       break;
     660             :     }
     661           0 :     case SIInstrInfo::EXECNZ: {
     662           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     663           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     664           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     665           0 :         .addImm(0);
     666           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     667           0 :         .addImm(-1)
     668           0 :         .addImm(0);
     669           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     670           0 :         .addReg(FalseReg)
     671           0 :         .addReg(TrueReg)
     672           0 :         .addReg(SReg);
     673           0 :       break;
     674             :     }
     675           0 :     case SIInstrInfo::EXECZ: {
     676           0 :       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     677           0 :       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     678           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
     679           0 :         .addImm(0);
     680           0 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
     681           0 :         .addImm(0)
     682           0 :         .addImm(-1);
     683           0 :       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
     684           0 :         .addReg(FalseReg)
     685           0 :         .addReg(TrueReg)
     686           0 :         .addReg(SReg);
     687           0 :       llvm_unreachable("Unhandled branch predicate EXECZ");
     688             :       break;
     689             :     }
     690           0 :     default:
     691           0 :       llvm_unreachable("invalid branch predicate");
     692             :     }
     693             :   } else {
     694           0 :     llvm_unreachable("Can only handle Cond size 1 or 2");
     695             :   }
     696           0 : }
     697             : 
     698           0 : unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
     699             :                                MachineBasicBlock::iterator I,
     700             :                                const DebugLoc &DL,
     701             :                                unsigned SrcReg, int Value) const {
     702           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     703           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     704           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
     705           0 :     .addImm(Value)
     706           0 :     .addReg(SrcReg);
     707             : 
     708           0 :   return Reg;
     709             : }
     710             : 
     711           0 : unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
     712             :                                MachineBasicBlock::iterator I,
     713             :                                const DebugLoc &DL,
     714             :                                unsigned SrcReg, int Value) const {
     715           0 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     716           0 :   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     717           0 :   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
     718           0 :     .addImm(Value)
     719           0 :     .addReg(SrcReg);
     720             : 
     721           0 :   return Reg;
     722             : }
     723             : 
     724        8536 : unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
     725             : 
     726       17072 :   if (RI.getRegSizeInBits(*DstRC) == 32) {
     727       16484 :     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     728         588 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
     729             :     return AMDGPU::S_MOV_B64;
     730         861 :   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
     731             :     return  AMDGPU::V_MOV_B64_PSEUDO;
     732             :   }
     733             :   return AMDGPU::COPY;
     734             : }
     735             : 
     736             : static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     737         603 :   switch (Size) {
     738             :   case 4:
     739             :     return AMDGPU::SI_SPILL_S32_SAVE;
     740          85 :   case 8:
     741             :     return AMDGPU::SI_SPILL_S64_SAVE;
     742          19 :   case 16:
     743             :     return AMDGPU::SI_SPILL_S128_SAVE;
     744          27 :   case 32:
     745             :     return AMDGPU::SI_SPILL_S256_SAVE;
     746           8 :   case 64:
     747             :     return AMDGPU::SI_SPILL_S512_SAVE;
     748           0 :   default:
     749           0 :     llvm_unreachable("unknown register size");
     750             :   }
     751             : }
     752             : 
     753             : static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     754        1086 :   switch (Size) {
     755             :   case 4:
     756             :     return AMDGPU::SI_SPILL_V32_SAVE;
     757          13 :   case 8:
     758             :     return AMDGPU::SI_SPILL_V64_SAVE;
     759           0 :   case 12:
     760             :     return AMDGPU::SI_SPILL_V96_SAVE;
     761         657 :   case 16:
     762             :     return AMDGPU::SI_SPILL_V128_SAVE;
     763           0 :   case 32:
     764             :     return AMDGPU::SI_SPILL_V256_SAVE;
     765           0 :   case 64:
     766             :     return AMDGPU::SI_SPILL_V512_SAVE;
     767           0 :   default:
     768           0 :     llvm_unreachable("unknown register size");
     769             :   }
     770             : }
     771             : 
     772        1689 : void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     773             :                                       MachineBasicBlock::iterator MI,
     774             :                                       unsigned SrcReg, bool isKill,
     775             :                                       int FrameIndex,
     776             :                                       const TargetRegisterClass *RC,
     777             :                                       const TargetRegisterInfo *TRI) const {
     778        1689 :   MachineFunction *MF = MBB.getParent();
     779        1689 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     780        1689 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     781        2775 :   DebugLoc DL = MBB.findDebugLoc(MI);
     782             : 
     783        1689 :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     784        1689 :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     785             :   MachinePointerInfo PtrInfo
     786        1689 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     787             :   MachineMemOperand *MMO
     788        3378 :     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
     789        1689 :                                Size, Align);
     790        3378 :   unsigned SpillSize = TRI->getSpillSize(*RC);
     791             : 
     792        3378 :   if (RI.isSGPRClass(RC)) {
     793         603 :     MFI->setHasSpilledSGPRs();
     794             : 
     795             :     // We are only allowed to create one new instruction when spilling
      796             :     // registers, so we need to use pseudo instructions for spilling SGPRs.
     797        1809 :     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
     798             : 
      799             :     // The SGPR spill/restore instructions only work on numbered SGPRs, so we
      800             :     // need to make sure we are using the correct register class.
     801         603 :     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
     802         140 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     803         140 :       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     804             :     }
     805             : 
     806        1206 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
     807         603 :       .addReg(SrcReg, getKillRegState(isKill)) // data
     808         603 :       .addFrameIndex(FrameIndex)               // addr
     809         603 :       .addMemOperand(MMO)
     810         603 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
     811         603 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
     812             :     // Add the scratch resource registers as implicit uses because we may end up
     813             :     // needing them, and need to ensure that the reserved registers are
     814             :     // correctly handled.
     815             : 
     816         603 :     FrameInfo.setStackID(FrameIndex, 1);
     817         603 :     if (ST.hasScalarStores()) {
     818             :       // m0 is used for offset to scalar stores if used to spill.
     819         298 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
     820             :     }
     821             : 
     822             :     return;
     823             :   }
     824             : 
     825        1086 :   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     826           0 :     LLVMContext &Ctx = MF->getFunction()->getContext();
     827           0 :     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
     828             :                   " spill register");
     829           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
     830           0 :       .addReg(SrcReg);
     831             : 
     832           0 :     return;
     833             :   }
     834             : 
     835             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
     836             : 
     837        1086 :   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
     838        1086 :   MFI->setHasSpilledVGPRs();
     839        3258 :   BuildMI(MBB, MI, DL, get(Opcode))
     840        1086 :     .addReg(SrcReg, getKillRegState(isKill)) // data
     841        1086 :     .addFrameIndex(FrameIndex)               // addr
     842        1086 :     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     843        1086 :     .addReg(MFI->getFrameOffsetReg())        // scratch_offset
     844        1086 :     .addImm(0)                               // offset
     845        1086 :     .addMemOperand(MMO);
     846             : }
     847             : 
     848             : static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     849         590 :   switch (Size) {
     850             :   case 4:
     851             :     return AMDGPU::SI_SPILL_S32_RESTORE;
     852          79 :   case 8:
     853             :     return AMDGPU::SI_SPILL_S64_RESTORE;
     854          18 :   case 16:
     855             :     return AMDGPU::SI_SPILL_S128_RESTORE;
     856          27 :   case 32:
     857             :     return AMDGPU::SI_SPILL_S256_RESTORE;
     858           8 :   case 64:
     859             :     return AMDGPU::SI_SPILL_S512_RESTORE;
     860           0 :   default:
     861           0 :     llvm_unreachable("unknown register size");
     862             :   }
     863             : }
     864             : 
     865             : static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     866         999 :   switch (Size) {
     867             :   case 4:
     868             :     return AMDGPU::SI_SPILL_V32_RESTORE;
     869          13 :   case 8:
     870             :     return AMDGPU::SI_SPILL_V64_RESTORE;
     871           0 :   case 12:
     872             :     return AMDGPU::SI_SPILL_V96_RESTORE;
     873         661 :   case 16:
     874             :     return AMDGPU::SI_SPILL_V128_RESTORE;
     875           0 :   case 32:
     876             :     return AMDGPU::SI_SPILL_V256_RESTORE;
     877           0 :   case 64:
     878             :     return AMDGPU::SI_SPILL_V512_RESTORE;
     879           0 :   default:
     880           0 :     llvm_unreachable("unknown register size");
     881             :   }
     882             : }
     883             : 
     884        1589 : void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     885             :                                        MachineBasicBlock::iterator MI,
     886             :                                        unsigned DestReg, int FrameIndex,
     887             :                                        const TargetRegisterClass *RC,
     888             :                                        const TargetRegisterInfo *TRI) const {
     889        1589 :   MachineFunction *MF = MBB.getParent();
     890        1589 :   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     891        1589 :   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
     892        2588 :   DebugLoc DL = MBB.findDebugLoc(MI);
     893        1589 :   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
     894        1589 :   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
     895        3178 :   unsigned SpillSize = TRI->getSpillSize(*RC);
     896             : 
     897             :   MachinePointerInfo PtrInfo
     898        1589 :     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
     899             : 
     900        3178 :   MachineMemOperand *MMO = MF->getMachineMemOperand(
     901        1589 :     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
     902             : 
     903        3178 :   if (RI.isSGPRClass(RC)) {
     904             :     // FIXME: Maybe this should not include a memoperand because it will be
     905             :     // lowered to non-memory instructions.
     906        1770 :     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
     907         590 :     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
     908         141 :       MachineRegisterInfo &MRI = MF->getRegInfo();
     909         141 :       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     910             :     }
     911             : 
     912         590 :     FrameInfo.setStackID(FrameIndex, 1);
     913        1180 :     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
     914         590 :       .addFrameIndex(FrameIndex) // addr
     915         590 :       .addMemOperand(MMO)
     916         590 :       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
     917         590 :       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
     918             : 
     919         590 :     if (ST.hasScalarStores()) {
     920             :       // m0 is used for offset to scalar stores if used to spill.
     921         293 :       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
     922             :     }
     923             : 
     924             :     return;
     925             :   }
     926             : 
     927         999 :   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     928           0 :     LLVMContext &Ctx = MF->getFunction()->getContext();
     929           0 :     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
     930             :                   " restore register");
     931           0 :     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
     932             : 
     933           0 :     return;
     934             :   }
     935             : 
     936             :   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
     937             : 
     938         999 :   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
     939        2997 :   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     940         999 :     .addFrameIndex(FrameIndex)        // vaddr
     941         999 :     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
     942         999 :     .addReg(MFI->getFrameOffsetReg()) // scratch_offset
     943         999 :     .addImm(0)                        // offset
     944         999 :     .addMemOperand(MMO);
     945             : }
     946             : 
      947             : /// \param Offset Offset in bytes of the FrameIndex being spilled
     948           0 : unsigned SIInstrInfo::calculateLDSSpillAddress(
     949             :     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
     950             :     unsigned FrameOffset, unsigned Size) const {
     951           0 :   MachineFunction *MF = MBB.getParent();
     952           0 :   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     953           0 :   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
     954           0 :   DebugLoc DL = MBB.findDebugLoc(MI);
     955           0 :   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
     956           0 :   unsigned WavefrontSize = ST.getWavefrontSize();
     957             : 
     958           0 :   unsigned TIDReg = MFI->getTIDReg();
     959           0 :   if (!MFI->hasCalculatedTID()) {
     960           0 :     MachineBasicBlock &Entry = MBB.getParent()->front();
     961           0 :     MachineBasicBlock::iterator Insert = Entry.front();
     962           0 :     DebugLoc DL = Insert->getDebugLoc();
     963             : 
     964           0 :     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
     965             :                                    *MF);
     966           0 :     if (TIDReg == AMDGPU::NoRegister)
     967           0 :       return TIDReg;
     968             : 
     969           0 :     if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
     970             :         WorkGroupSize > WavefrontSize) {
     971             :       unsigned TIDIGXReg
     972           0 :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
     973             :       unsigned TIDIGYReg
     974           0 :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
     975             :       unsigned TIDIGZReg
     976           0 :         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
     977             :       unsigned InputPtrReg =
     978           0 :           MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
     979           0 :       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
     980           0 :         if (!Entry.isLiveIn(Reg))
     981           0 :           Entry.addLiveIn(Reg);
     982             :       }
     983             : 
     984           0 :       RS->enterBasicBlock(Entry);
     985             :       // FIXME: Can we scavenge an SReg_64 and access the subregs?
     986           0 :       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
     987           0 :       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
     988           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
     989           0 :               .addReg(InputPtrReg)
     990           0 :               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
     991           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
     992           0 :               .addReg(InputPtrReg)
     993           0 :               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
     994             : 
     995             :       // NGROUPS.X * NGROUPS.Y
     996           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
     997           0 :               .addReg(STmp1)
     998           0 :               .addReg(STmp0);
     999             :       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
    1000           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
    1001           0 :               .addReg(STmp1)
    1002           0 :               .addReg(TIDIGXReg);
    1003             :       // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
    1004           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
    1005           0 :               .addReg(STmp0)
    1006           0 :               .addReg(TIDIGYReg)
    1007           0 :               .addReg(TIDReg);
    1008             :       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
    1009           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
    1010           0 :               .addReg(TIDReg)
    1011           0 :               .addReg(TIDIGZReg);
    1012             :     } else {
    1013             :       // Get the wave id
    1014           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
    1015           0 :               TIDReg)
    1016           0 :               .addImm(-1)
    1017           0 :               .addImm(0);
    1018             : 
    1019           0 :       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
    1020           0 :               TIDReg)
    1021           0 :               .addImm(-1)
    1022           0 :               .addReg(TIDReg);
    1023             :     }
    1024             : 
    1025           0 :     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
    1026           0 :             TIDReg)
    1027           0 :             .addImm(2)
    1028           0 :             .addReg(TIDReg);
    1029           0 :     MFI->setTIDReg(TIDReg);
    1030             :   }
    1031             : 
    1032             :   // Add FrameIndex to LDS offset
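                     :   // A rough worked example (numbers hypothetical): with getLDSSize() == 512,
                     :   // FrameOffset == 8 and a max flat work group size of 256, the base below is
                     :   // 512 + 8 * 256 = 2560 bytes; the per-lane TIDReg term, already scaled to a
                     :   // byte offset by the V_LSHLREV_B32 above, is then added on top.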
    1033           0 :   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
    1034           0 :   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    1035           0 :           .addImm(LDSOffset)
    1036           0 :           .addReg(TIDReg);
    1037             : 
    1038           0 :   return TmpReg;
    1039             : }
    1040             : 
    1041        1570 : void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
    1042             :                                    MachineBasicBlock::iterator MI,
    1043             :                                    int Count) const {
    1044        1570 :   DebugLoc DL = MBB.findDebugLoc(MI);
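                     :   // Each S_NOP encodes its wait count minus one in the immediate, covering at
                     :   // most 8 wait states per instruction, so larger counts are split up: e.g.
                     :   // Count == 10 emits "s_nop 7" (8 states) followed by "s_nop 1" (2 states).
                     :   // getNumWaitStates() below reads the same encoding back as imm + 1.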
    1045        4710 :   while (Count > 0) {
    1046             :     int Arg;
    1047        1570 :     if (Count >= 8)
    1048             :       Arg = 7;
    1049             :     else
    1050        1570 :       Arg = Count - 1;
    1051        1570 :     Count -= 8;
    1052        4710 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
    1053        3140 :             .addImm(Arg);
    1054             :   }
    1055        1570 : }
    1056             : 
    1057        1570 : void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
    1058             :                              MachineBasicBlock::iterator MI) const {
    1059        1570 :   insertWaitStates(MBB, MI, 1);
    1060        1570 : }
    1061             : 
    1062           0 : void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
    1063           0 :   auto MF = MBB.getParent();
    1064           0 :   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    1065             : 
    1066             :   assert(Info->isEntryFunction());
    1067             : 
    1068           0 :   if (MBB.succ_empty()) {
    1069           0 :     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    1070           0 :     if (HasNoTerminator)
    1071           0 :       BuildMI(MBB, MBB.end(), DebugLoc(),
    1072           0 :               get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
    1073             :   }
    1074           0 : }
    1075             : 
    1076      441989 : unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
    1077      883978 :   switch (MI.getOpcode()) {
    1078             :   default: return 1; // FIXME: Do wait states equal cycles?
    1079             : 
    1080        1297 :   case AMDGPU::S_NOP:
    1081        1297 :     return MI.getOperand(0).getImm() + 1;
    1082             :   }
    1083             : }
    1084             : 
    1085      242357 : bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
    1086      242357 :   MachineBasicBlock &MBB = *MI.getParent();
    1087      727071 :   DebugLoc DL = MBB.findDebugLoc(MI);
    1088      484714 :   switch (MI.getOpcode()) {
    1089             :   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
    1090           0 :   case AMDGPU::S_MOV_B64_term:
    1091             :     // This is only a terminator to get the correct spill code placement during
    1092             :     // register allocation.
    1093           0 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1094             :     break;
    1095             : 
    1096           0 :   case AMDGPU::S_XOR_B64_term:
    1097             :     // This is only a terminator to get the correct spill code placement during
    1098             :     // register allocation.
    1099           0 :     MI.setDesc(get(AMDGPU::S_XOR_B64));
    1100             :     break;
    1101             : 
    1102           0 :   case AMDGPU::S_ANDN2_B64_term:
    1103             :     // This is only a terminator to get the correct spill code placement during
    1104             :     // register allocation.
    1105           0 :     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    1106             :     break;
    1107             : 
    1108         210 :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1109         210 :     unsigned Dst = MI.getOperand(0).getReg();
    1110         210 :     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    1111         210 :     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    1112             : 
    1113         210 :     const MachineOperand &SrcOp = MI.getOperand(1);
    1114             :     // FIXME: Will this work for 64-bit floating point immediates?
    1115             :     assert(!SrcOp.isFPImm());
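                     :     // The 64-bit source is split into two 32-bit moves of its halves; e.g.
                     :     // (value hypothetical) an immediate of 0x400921FB54442D18 becomes
                     :     // v_mov_b32 dst.sub0, 0x54442D18 and v_mov_b32 dst.sub1, 0x400921FB.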
    1116         210 :     if (SrcOp.isImm()) {
    1117         630 :       APInt Imm(64, SrcOp.getImm());
    1118         630 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1119         840 :         .addImm(Imm.getLoBits(32).getZExtValue())
    1120         210 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1121         630 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1122         840 :         .addImm(Imm.getHiBits(32).getZExtValue())
    1123         210 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1124             :     } else {
    1125             :       assert(SrcOp.isReg());
    1126           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    1127           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
    1128           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1129           0 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    1130           0 :         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
    1131           0 :         .addReg(Dst, RegState::Implicit | RegState::Define);
    1132             :     }
    1133         210 :     MI.eraseFromParent();
    1134         210 :     break;
    1135             :   }
    1136           6 :   case AMDGPU::V_SET_INACTIVE_B32: {
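                     :     // Inverting EXEC makes the move below write only the lanes that were
                     :     // inactive on entry; the second S_NOT_B64 restores the original exec mask.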
    1137          18 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1138           6 :       .addReg(AMDGPU::EXEC);
    1139          18 :     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
    1140          18 :       .add(MI.getOperand(2));
    1141          18 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1142           6 :       .addReg(AMDGPU::EXEC);
    1143           6 :     MI.eraseFromParent();
    1144           6 :     break;
    1145             :   }
    1146           2 :   case AMDGPU::V_SET_INACTIVE_B64: {
    1147           6 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1148           2 :       .addReg(AMDGPU::EXEC);
    1149           4 :     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
    1150           4 :                                  MI.getOperand(0).getReg())
    1151           6 :       .add(MI.getOperand(2));
    1152           2 :     expandPostRAPseudo(*Copy);
    1153           6 :     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
    1154           2 :       .addReg(AMDGPU::EXEC);
    1155           2 :     MI.eraseFromParent();
    1156           2 :     break;
    1157             :   }
    1158          64 :   case AMDGPU::V_MOVRELD_B32_V1:
    1159             :   case AMDGPU::V_MOVRELD_B32_V2:
    1160             :   case AMDGPU::V_MOVRELD_B32_V4:
    1161             :   case AMDGPU::V_MOVRELD_B32_V8:
    1162             :   case AMDGPU::V_MOVRELD_B32_V16: {
    1163         128 :     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    1164          64 :     unsigned VecReg = MI.getOperand(0).getReg();
    1165         128 :     bool IsUndef = MI.getOperand(1).isUndef();
    1166          64 :     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    1167             :     assert(VecReg == MI.getOperand(1).getReg());
    1168             : 
    1169             :     MachineInstr *MovRel =
    1170         128 :         BuildMI(MBB, MI, DL, MovRelDesc)
    1171          64 :             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
    1172         192 :             .add(MI.getOperand(2))
    1173          64 :             .addReg(VecReg, RegState::ImplicitDefine)
    1174             :             .addReg(VecReg,
    1175          64 :                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
    1176             : 
    1177             :     const int ImpDefIdx =
    1178          64 :         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    1179          64 :     const int ImpUseIdx = ImpDefIdx + 1;
    1180          64 :     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
    1181             : 
    1182          64 :     MI.eraseFromParent();
    1183          64 :     break;
    1184             :   }
    1185         480 :   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    1186         480 :     MachineFunction &MF = *MBB.getParent();
    1187         480 :     unsigned Reg = MI.getOperand(0).getReg();
    1188         480 :     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    1189         480 :     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
    1190             : 
    1191             :     // Create a bundle so these instructions won't be re-ordered by the
    1192             :     // post-RA scheduler.
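                     :     // The bundle expands to roughly (register names illustrative):
                     :     //   s_getpc_b64 s[N:N+1]
                     :     //   s_add_u32   sN,   sN,   sym@rel32@lo
                     :     //   s_addc_u32  sN+1, sN+1, sym@rel32@hi   (or 0 if operand 2 has no flag)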
    1193         960 :     MIBundleBuilder Bundler(MBB, MI);
    1194        1440 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
    1195             : 
    1196             :     // Add 32-bit offset from this instruction to the start of the
    1197             :     // constant data.
    1198        1440 :     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
    1199         480 :                        .addReg(RegLo)
    1200        1440 :                        .add(MI.getOperand(1)));
    1201             : 
    1202        1440 :     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
    1203         480 :                                   .addReg(RegHi);
    1204         960 :     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
    1205             :       MIB.addImm(0);
    1206             :     else
    1207         922 :       MIB.add(MI.getOperand(2));
    1208             : 
    1209         960 :     Bundler.append(MIB);
    1210         480 :     finalizeBundle(MBB, Bundler.begin());
    1211             : 
    1212         480 :     MI.eraseFromParent();
    1213             :     break;
    1214             :   }
    1215          16 :   case AMDGPU::EXIT_WWM: {
    1216             :     // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
    1217             :     // is exited.
    1218          32 :     MI.setDesc(get(AMDGPU::S_MOV_B64));
    1219             :     break;
    1220             :   }
    1221             :   }
    1222             :   return true;
    1223             : }
    1224             : 
    1225      189725 : bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
    1226             :                                       MachineOperand &Src0,
    1227             :                                       unsigned Src0OpName,
    1228             :                                       MachineOperand &Src1,
    1229             :                                       unsigned Src1OpName) const {
    1230      189725 :   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
    1231      189725 :   if (!Src0Mods)
    1232             :     return false;
    1233             : 
    1234       36598 :   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
    1235             :   assert(Src1Mods &&
    1236             :          "All commutable instructions have both src0 and src1 modifiers");
    1237             : 
    1238       36598 :   int Src0ModsVal = Src0Mods->getImm();
    1239       36598 :   int Src1ModsVal = Src1Mods->getImm();
    1240             : 
    1241       73196 :   Src1Mods->setImm(Src0ModsVal);
    1242       73196 :   Src0Mods->setImm(Src1ModsVal);
    1243       36598 :   return true;
    1244             : }
    1245             : 
    1246       25016 : static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
    1247             :                                              MachineOperand &RegOp,
    1248             :                                              MachineOperand &NonRegOp) {
    1249       25016 :   unsigned Reg = RegOp.getReg();
    1250       25016 :   unsigned SubReg = RegOp.getSubReg();
    1251       25016 :   bool IsKill = RegOp.isKill();
    1252       25016 :   bool IsDead = RegOp.isDead();
    1253       25016 :   bool IsUndef = RegOp.isUndef();
    1254       25016 :   bool IsDebug = RegOp.isDebug();
    1255             : 
    1256       25016 :   if (NonRegOp.isImm())
    1257       25016 :     RegOp.ChangeToImmediate(NonRegOp.getImm());
    1258           0 :   else if (NonRegOp.isFI())
    1259           0 :     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
    1260             :   else
    1261             :     return nullptr;
    1262             : 
    1263       25016 :   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
    1264       25016 :   NonRegOp.setSubReg(SubReg);
    1265             : 
    1266       25016 :   return &MI;
    1267             : }
    1268             : 
    1269      232507 : MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
    1270             :                                                   unsigned Src0Idx,
    1271             :                                                   unsigned Src1Idx) const {
    1272             :   assert(!NewMI && "this should never be used");
    1273             : 
    1274      465014 :   unsigned Opc = MI.getOpcode();
    1275      232507 :   int CommutedOpcode = commuteOpcode(Opc);
    1276      232507 :   if (CommutedOpcode == -1)
    1277             :     return nullptr;
    1278             : 
    1279             :   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
    1280             :            static_cast<int>(Src0Idx) &&
    1281             :          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
    1282             :            static_cast<int>(Src1Idx) &&
    1283             :          "inconsistency with findCommutedOpIndices");
    1284             : 
    1285      440184 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    1286      440184 :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    1287             : 
    1288      220092 :   MachineInstr *CommutedMI = nullptr;
    1289      417354 :   if (Src0.isReg() && Src1.isReg()) {
    1290      183166 :     if (isOperandLegal(MI, Src1Idx, &Src0)) {
    1291             :       // Be sure to copy the source modifiers to the right place.
    1292             :       CommutedMI
    1293      164709 :         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    1294             :     }
    1295             : 
    1296       51022 :   } else if (Src0.isReg() && !Src1.isReg()) {
    1297             :     // src0 should always be able to support any operand type, so no need to
    1298             :     // check operand legality.
    1299       14096 :     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
    1300       45660 :   } else if (!Src0.isReg() && Src1.isReg()) {
    1301       22829 :     if (isOperandLegal(MI, Src1Idx, &Src0))
    1302       10920 :       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
    1303             :   } else {
    1304             :     // FIXME: Found two non registers to commute. This does happen.
    1305             :     return nullptr;
    1306             :   }
    1307             : 
    1308      189725 :   if (CommutedMI) {
    1309      189725 :     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
    1310             :                         Src1, AMDGPU::OpName::src1_modifiers);
    1311             : 
    1312      379450 :     CommutedMI->setDesc(get(CommutedOpcode));
    1313             :   }
    1314             : 
    1315             :   return CommutedMI;
    1316             : }
    1317             : 
    1318             : // This needs to be implemented because the source modifiers may be inserted
    1319             : // between the true commutable operands, and the base
    1320             : // TargetInstrInfo::commuteInstruction uses it.
    1321      261878 : bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
    1322             :                                         unsigned &SrcOpIdx1) const {
    1323      261878 :   if (!MI.isCommutable())
    1324             :     return false;
    1325             : 
    1326      456932 :   unsigned Opc = MI.getOpcode();
    1327      228466 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    1328      228466 :   if (Src0Idx == -1)
    1329             :     return false;
    1330             : 
    1331      228466 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    1332      228466 :   if (Src1Idx == -1)
    1333             :     return false;
    1334             : 
    1335      228466 :   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
    1336             : }
    1337             : 
    1338         924 : bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
    1339             :                                         int64_t BrOffset) const {
    1340             :   // BranchRelaxation should never have to check s_setpc_b64 because its dest
    1341             :   // block is unanalyzable.
    1342             :   assert(BranchOp != AMDGPU::S_SETPC_B64);
    1343             : 
    1344             :   // Convert to dwords.
    1345         924 :   BrOffset /= 4;
    1346             : 
    1347             :   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
    1348             :   // from the next instruction.
    1349         924 :   BrOffset -= 1;
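                     :   // e.g. a forward branch 40 bytes ahead becomes 40 / 4 - 1 = 9 dwords, which
                     :   // must then fit in the signed SIMM16 field checked below.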
    1350             : 
    1351        1848 :   return isIntN(BranchOffsetBits, BrOffset);
    1352             : }
    1353             : 
    1354         953 : MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
    1355             :   const MachineInstr &MI) const {
    1356        1906 :   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    1357             :     // This would be a difficult analysis to perform, but can always be legal so
    1358             :     // there's no need to analyze it.
    1359             :     return nullptr;
    1360             :   }
    1361             : 
    1362         953 :   return MI.getOperand(0).getMBB();
    1363             : }
    1364             : 
    1365          29 : unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
    1366             :                                            MachineBasicBlock &DestBB,
    1367             :                                            const DebugLoc &DL,
    1368             :                                            int64_t BrOffset,
    1369             :                                            RegScavenger *RS) const {
    1370             :   assert(RS && "RegScavenger required for long branching");
    1371             :   assert(MBB.empty() &&
    1372             :          "new block should be inserted for expanding unconditional branch");
    1373             :   assert(MBB.pred_size() == 1);
    1374             : 
    1375          29 :   MachineFunction *MF = MBB.getParent();
    1376          29 :   MachineRegisterInfo &MRI = MF->getRegInfo();
    1377             : 
    1378             :   // FIXME: Virtual register workaround for RegScavenger not working with empty
    1379             :   // blocks.
    1380          29 :   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    1381             : 
    1382          29 :   auto I = MBB.end();
    1383             : 
    1384             :   // We need to compute the offset relative to the instruction immediately after
    1385             :   // s_getpc_b64. Insert pc arithmetic code before last terminator.
    1386          58 :   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
    1387             : 
    1388             :   // TODO: Handle > 32-bit block address.
    1389          29 :   if (BrOffset >= 0) {
    1390          63 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
    1391          21 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1392          21 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1393          21 :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    1394          63 :     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
    1395          21 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1396          21 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1397          21 :       .addImm(0);
    1398             :   } else {
    1399             :     // Backwards branch.
    1400          24 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
    1401           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
    1402           8 :       .addReg(PCReg, 0, AMDGPU::sub0)
    1403           8 :       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    1404          24 :     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
    1405           8 :       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
    1406           8 :       .addReg(PCReg, 0, AMDGPU::sub1)
    1407           8 :       .addImm(0);
    1408             :   }
    1409             : 
    1410             :   // Insert the indirect branch after the other terminator.
    1411         116 :   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    1412          29 :     .addReg(PCReg);
    1413             : 
    1414             :   // FIXME: If spilling is necessary, this will fail because this scavenger has
    1415             :   // no emergency stack slots. It is non-trivial to spill in this situation,
    1416             :   // because the restore code needs to be specially placed after the
    1417             :   // jump. BranchRelaxation then needs to be made aware of the newly inserted
    1418             :   // block.
    1419             :   //
    1420             :   // If a spill is needed for the pc register pair, we need to insert a spill
    1421             :   // restore block right before the destination block, and insert a short branch
    1422             :   // into the old destination block's fallthrough predecessor.
    1423             :   // e.g.:
    1424             :   //
    1425             :   // s_cbranch_scc0 skip_long_branch:
    1426             :   //
    1427             :   // long_branch_bb:
    1428             :   //   spill s[8:9]
    1429             :   //   s_getpc_b64 s[8:9]
    1430             :   //   s_add_u32 s8, s8, restore_bb
    1431             :   //   s_addc_u32 s9, s9, 0
    1432             :   //   s_setpc_b64 s[8:9]
    1433             :   //
    1434             :   // skip_long_branch:
    1435             :   //   foo;
    1436             :   //
    1437             :   // .....
    1438             :   //
    1439             :   // dest_bb_fallthrough_predecessor:
    1440             :   // bar;
    1441             :   // s_branch dest_bb
    1442             :   //
    1443             :   // restore_bb:
    1444             :   //  restore s[8:9]
    1445             :   //  fallthrough dest_bb
    1446             :   //
    1447             :   // dest_bb:
    1448             :   //   buzz;
    1449             : 
    1450          29 :   RS->enterBasicBlockEnd(MBB);
    1451          57 :   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
    1452          29 :                                        MachineBasicBlock::iterator(GetPC), 0);
    1453          28 :   MRI.replaceRegWith(PCReg, Scav);
    1454          28 :   MRI.clearVirtRegs();
    1455          28 :   RS->setRegUsed(Scav);
    1456             : 
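                     :   // The return value estimates the size in bytes of the inserted sequence,
                     :   // presumably s_getpc_b64 (4) + s_add_u32 with a 32-bit literal (8) +
                     :   // s_addc_u32 (4) + s_setpc_b64 (4).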
    1457          28 :   return 4 + 8 + 4 + 4;
    1458             : }
    1459             : 
    1460        1480 : unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
    1461        1480 :   switch (Cond) {
    1462             :   case SIInstrInfo::SCC_TRUE:
    1463             :     return AMDGPU::S_CBRANCH_SCC1;
    1464         412 :   case SIInstrInfo::SCC_FALSE:
    1465         412 :     return AMDGPU::S_CBRANCH_SCC0;
    1466         237 :   case SIInstrInfo::VCCNZ:
    1467         237 :     return AMDGPU::S_CBRANCH_VCCNZ;
    1468         243 :   case SIInstrInfo::VCCZ:
    1469         243 :     return AMDGPU::S_CBRANCH_VCCZ;
    1470          89 :   case SIInstrInfo::EXECNZ:
    1471          89 :     return AMDGPU::S_CBRANCH_EXECNZ;
    1472          59 :   case SIInstrInfo::EXECZ:
    1473          59 :     return AMDGPU::S_CBRANCH_EXECZ;
    1474           0 :   default:
    1475           0 :     llvm_unreachable("invalid branch predicate");
    1476             :   }
    1477             : }
    1478             : 
    1479      653018 : SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
    1480             :   switch (Opcode) {
    1481             :   case AMDGPU::S_CBRANCH_SCC0:
    1482             :     return SCC_FALSE;
    1483             :   case AMDGPU::S_CBRANCH_SCC1:
    1484             :     return SCC_TRUE;
    1485             :   case AMDGPU::S_CBRANCH_VCCNZ:
    1486             :     return VCCNZ;
    1487             :   case AMDGPU::S_CBRANCH_VCCZ:
    1488             :     return VCCZ;
    1489             :   case AMDGPU::S_CBRANCH_EXECNZ:
    1490             :     return EXECNZ;
    1491             :   case AMDGPU::S_CBRANCH_EXECZ:
    1492             :     return EXECZ;
    1493             :   default:
    1494             :     return INVALID_BR;
    1495             :   }
    1496             : }
    1497             : 
    1498      677270 : bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
    1499             :                                     MachineBasicBlock::iterator I,
    1500             :                                     MachineBasicBlock *&TBB,
    1501             :                                     MachineBasicBlock *&FBB,
    1502             :                                     SmallVectorImpl<MachineOperand> &Cond,
    1503             :                                     bool AllowModify) const {
    1504     1354540 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1505             :     // Unconditional Branch
    1506       24252 :     TBB = I->getOperand(0).getMBB();
    1507       24252 :     return false;
    1508             :   }
    1509             : 
    1510      653018 :   MachineBasicBlock *CondBB = nullptr;
    1511             : 
    1512      653018 :   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    1513           0 :     CondBB = I->getOperand(1).getMBB();
    1514           0 :     Cond.push_back(I->getOperand(0));
    1515             :   } else {
    1516      653018 :     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    1517      653018 :     if (Pred == INVALID_BR)
    1518             :       return true;
    1519             : 
    1520       29062 :     CondBB = I->getOperand(0).getMBB();
    1521       58124 :     Cond.push_back(MachineOperand::CreateImm(Pred));
    1522       87186 :     Cond.push_back(I->getOperand(1)); // Save the branch register.
    1523             :   }
    1524       29062 :   ++I;
    1525             : 
    1526       58124 :   if (I == MBB.end()) {
    1527             :     // Conditional branch followed by fall-through.
    1528       15294 :     TBB = CondBB;
    1529       15294 :     return false;
    1530             :   }
    1531             : 
    1532       27536 :   if (I->getOpcode() == AMDGPU::S_BRANCH) {
    1533       13768 :     TBB = CondBB;
    1534       13768 :     FBB = I->getOperand(0).getMBB();
    1535       13768 :     return false;
    1536             :   }
    1537             : 
    1538             :   return true;
    1539             : }
    1540             : 
    1541      714986 : bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
    1542             :                                 MachineBasicBlock *&FBB,
    1543             :                                 SmallVectorImpl<MachineOperand> &Cond,
    1544             :                                 bool AllowModify) const {
    1545      714986 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1546     1429972 :   if (I == MBB.end())
    1547             :     return false;
    1548             : 
    1549     1356118 :   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    1550      666627 :     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
    1551             : 
    1552       11432 :   ++I;
    1553             : 
    1554             :   // TODO: Should be able to treat as fallthrough?
    1555       22864 :   if (I == MBB.end())
    1556             :     return true;
    1557             : 
    1558       10643 :   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    1559             :     return true;
    1560             : 
    1561       10643 :   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
    1562             : 
    1563             :   // Specifically handle the case where the conditional branch is to the same
    1564             :   // destination as the mask branch. e.g.
    1565             :   //
    1566             :   // si_mask_branch BB8
    1567             :   // s_cbranch_execz BB8
    1568             :   // s_cbranch BB9
    1569             :   //
    1570             :   // This is required to understand divergent loops which may need the branches
    1571             :   // to be relaxed.
    1572       10643 :   if (TBB != MaskBrDest || Cond.empty())
    1573             :     return true;
    1574             : 
    1575         524 :   auto Pred = Cond[0].getImm();
    1576         262 :   return (Pred != EXECZ && Pred != EXECNZ);
    1577             : }
    1578             : 
    1579        2371 : unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
    1580             :                                    int *BytesRemoved) const {
    1581        2371 :   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
    1582             : 
    1583        2371 :   unsigned Count = 0;
    1584        2371 :   unsigned RemovedSize = 0;
    1585       10622 :   while (I != MBB.end()) {
    1586        2940 :     MachineBasicBlock::iterator Next = std::next(I);
    1587        5882 :     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
    1588           2 :       I = Next;
    1589           2 :       continue;
    1590             :     }
    1591             : 
    1592        2938 :     RemovedSize += getInstSizeInBytes(*I);
    1593        2938 :     I->eraseFromParent();
    1594        2938 :     ++Count;
    1595        2938 :     I = Next;
    1596             :   }
    1597             : 
    1598        2371 :   if (BytesRemoved)
    1599          23 :     *BytesRemoved = RemovedSize;
    1600             : 
    1601        2371 :   return Count;
    1602             : }
    1603             : 
    1604             : // Copy the flags onto the implicit condition register operand.
    1605             : static void preserveCondRegFlags(MachineOperand &CondReg,
    1606             :                                  const MachineOperand &OrigCond) {
    1607        2828 :   CondReg.setIsUndef(OrigCond.isUndef());
    1608        2828 :   CondReg.setIsKill(OrigCond.isKill());
    1609             : }
    1610             : 
    1611        2138 : unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
    1612             :                                    MachineBasicBlock *TBB,
    1613             :                                    MachineBasicBlock *FBB,
    1614             :                                    ArrayRef<MachineOperand> Cond,
    1615             :                                    const DebugLoc &DL,
    1616             :                                    int *BytesAdded) const {
    1617        2138 :   if (!FBB && Cond.empty()) {
    1618        1974 :     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1619         658 :       .addMBB(TBB);
    1620         658 :     if (BytesAdded)
    1621           0 :       *BytesAdded = 4;
    1622             :     return 1;
    1623             :   }
    1624             : 
    1625        1480 :   if (Cond.size() == 1 && Cond[0].isReg()) {
    1626           0 :      BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
    1627           0 :        .add(Cond[0])
    1628           0 :        .addMBB(TBB);
    1629           0 :      return 1;
    1630             :   }
    1631             : 
    1632             :   assert(TBB && Cond[0].isImm());
    1633             : 
    1634             :   unsigned Opcode
    1635        1480 :     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
    1636             : 
    1637        1480 :   if (!FBB) {
    1639             :     MachineInstr *CondBr =
    1640        4164 :       BuildMI(&MBB, DL, get(Opcode))
    1641        1388 :       .addMBB(TBB);
    1642             : 
    1643             :     // Copy the flags onto the implicit condition register operand.
    1644        2776 :     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    1645             : 
    1646        1388 :     if (BytesAdded)
    1647           0 :       *BytesAdded = 4;
    1648             :     return 1;
    1649             :   }
    1650             : 
    1651             :   assert(TBB && FBB);
    1652             : 
    1653             :   MachineInstr *CondBr =
    1654         276 :     BuildMI(&MBB, DL, get(Opcode))
    1655          92 :     .addMBB(TBB);
    1656         276 :   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    1657          92 :     .addMBB(FBB);
    1658             : 
    1659          92 :   MachineOperand &CondReg = CondBr->getOperand(1);
    1660         276 :   CondReg.setIsUndef(Cond[1].isUndef());
    1661         276 :   CondReg.setIsKill(Cond[1].isKill());
    1662             : 
    1663          92 :   if (BytesAdded)
    1664          23 :       *BytesAdded = 8;
    1665             : 
    1666             :   return 2;
    1667             : }
    1668             : 
    1669        1230 : bool SIInstrInfo::reverseBranchCondition(
    1670             :   SmallVectorImpl<MachineOperand> &Cond) const {
    1671        2460 :   if (Cond.size() != 2) {
    1672             :     return true;
    1673             :   }
    1674             : 
    1675        3690 :   if (Cond[0].isImm()) {
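                     :     // Each BranchPredicate and its inverse are encoded with opposite signs
                     :     // (compare the -Pred flip in insertSelect below), so negating the
                     :     // immediate reverses the condition.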
    1676        4920 :     Cond[0].setImm(-Cond[0].getImm());
    1677        1230 :     return false;
    1678             :   }
    1679             : 
    1680             :   return true;
    1681             : }
    1682             : 
    1683          22 : bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
    1684             :                                   ArrayRef<MachineOperand> Cond,
    1685             :                                   unsigned TrueReg, unsigned FalseReg,
    1686             :                                   int &CondCycles,
    1687             :                                   int &TrueCycles, int &FalseCycles) const {
    1688          22 :   switch (Cond[0].getImm()) {
    1689          15 :   case VCCNZ:
    1690             :   case VCCZ: {
    1691          15 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1692          15 :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1693             :     assert(MRI.getRegClass(FalseReg) == RC);
    1694             : 
    1695          15 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1696          15 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1697             : 
    1698             :     // Limit to equal cost for branch vs. N v_cndmask_b32s.
    1699          30 :     return !RI.isSGPRClass(RC) && NumInsts <= 6;
    1700             :   }
    1701           7 :   case SCC_TRUE:
    1702             :   case SCC_FALSE: {
    1703             :     // FIXME: We could insert for VGPRs if we could replace the original compare
    1704             :     // with a vector one.
    1705           7 :     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1706           7 :     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    1707             :     assert(MRI.getRegClass(FalseReg) == RC);
    1708             : 
    1709           7 :     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
    1710             : 
    1711             :     // Multiples of 8 can do s_cselect_b64
    1712           7 :     if (NumInsts % 2 == 0)
    1713           3 :       NumInsts /= 2;
    1714             : 
    1715           7 :     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    1716          14 :     return RI.isSGPRClass(RC);
    1717             :   }
    1718             :   default:
    1719             :     return false;
    1720             :   }
    1721             : }
    1722             : 
    1723          16 : void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
    1724             :                                MachineBasicBlock::iterator I, const DebugLoc &DL,
    1725             :                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
    1726             :                                unsigned TrueReg, unsigned FalseReg) const {
    1727          16 :   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
    1728          16 :   if (Pred == VCCZ || Pred == SCC_FALSE) {
    1729           0 :     Pred = static_cast<BranchPredicate>(-Pred);
    1730             :     std::swap(TrueReg, FalseReg);
    1731             :   }
    1732             : 
    1733          16 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    1734          16 :   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
    1735          32 :   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
    1736             : 
    1737          16 :   if (DstSize == 32) {
    1738           9 :     unsigned SelOp = Pred == SCC_TRUE ?
    1739             :       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
    1740             : 
    1741             :     // Instruction's operands are backwards from what is expected.
    1742             :     MachineInstr *Select =
    1743          27 :       BuildMI(MBB, I, DL, get(SelOp), DstReg)
    1744           9 :       .addReg(FalseReg)
    1745           9 :       .addReg(TrueReg);
    1746             : 
    1747           9 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1748          10 :     return;
    1749             :   }
    1750             : 
    1751           7 :   if (DstSize == 64 && Pred == SCC_TRUE) {
    1752             :     MachineInstr *Select =
    1753           3 :       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
    1754           1 :       .addReg(FalseReg)
    1755           1 :       .addReg(TrueReg);
    1756             : 
    1757           1 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1758             :     return;
    1759             :   }
    1760             : 
    1761             :   static const int16_t Sub0_15[] = {
    1762             :     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    1763             :     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    1764             :     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    1765             :     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    1766             :   };
    1767             : 
    1768             :   static const int16_t Sub0_15_64[] = {
    1769             :     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    1770             :     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    1771             :     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    1772             :     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
    1773             :   };
    1774             : 
    1775           6 :   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
    1776           6 :   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
    1777           6 :   const int16_t *SubIndices = Sub0_15;
    1778           6 :   int NElts = DstSize / 32;
    1779             : 
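                     :   // Wide selects are decomposed into 32-bit elements and reassembled with a
                     :   // REG_SEQUENCE; e.g. a 128-bit VGPR result becomes four v_cndmask_b32s
                     :   // written into sub0..sub3 (or two s_cselect_b64s on the SALU path below).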
    1780             :   // 64-bit select is only available for SALU.
    1781           6 :   if (Pred == SCC_TRUE) {
    1782           2 :     SelOp = AMDGPU::S_CSELECT_B64;
    1783           2 :     EltRC = &AMDGPU::SGPR_64RegClass;
    1784           2 :     SubIndices = Sub0_15_64;
    1785             : 
    1786             :     assert(NElts % 2 == 0);
    1787           2 :     NElts /= 2;
    1788             :   }
    1789             : 
    1790             :   MachineInstrBuilder MIB = BuildMI(
    1791          12 :     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
    1792             : 
    1793          12 :   I = MIB->getIterator();
    1794             : 
    1795          12 :   SmallVector<unsigned, 8> Regs;
    1796          22 :   for (int Idx = 0; Idx != NElts; ++Idx) {
    1797          16 :     unsigned DstElt = MRI.createVirtualRegister(EltRC);
    1798          16 :     Regs.push_back(DstElt);
    1799             : 
    1800          16 :     unsigned SubIdx = SubIndices[Idx];
    1801             : 
    1802             :     MachineInstr *Select =
    1803          48 :       BuildMI(MBB, I, DL, get(SelOp), DstElt)
    1804          16 :       .addReg(FalseReg, 0, SubIdx)
    1805          16 :       .addReg(TrueReg, 0, SubIdx);
    1806          32 :     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    1807             : 
    1808          16 :     MIB.addReg(DstElt)
    1809          48 :        .addImm(SubIdx);
    1810             :   }
    1811             : }
    1812             : 
    1813      755113 : bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
    1814     1510226 :   switch (MI.getOpcode()) {
    1815       22082 :   case AMDGPU::V_MOV_B32_e32:
    1816             :   case AMDGPU::V_MOV_B32_e64:
    1817             :   case AMDGPU::V_MOV_B64_PSEUDO: {
    1818             :     // If there are additional implicit register operands, this may be used for
    1819             :     // register indexing so the source register operand isn't simply copied.
    1820       22082 :     unsigned NumOps = MI.getDesc().getNumOperands() +
    1821       44164 :       MI.getDesc().getNumImplicitUses();
    1822             : 
    1823       22082 :     return MI.getNumOperands() == NumOps;
    1824             :   }
    1825             :   case AMDGPU::S_MOV_B32:
    1826             :   case AMDGPU::S_MOV_B64:
    1827             :   case AMDGPU::COPY:
    1828             :     return true;
    1829      448495 :   default:
    1830      448495 :     return false;
    1831             :   }
    1832             : }
    1833             : 
    1834          16 : static void removeModOperands(MachineInstr &MI) {
    1835          32 :   unsigned Opc = MI.getOpcode();
    1836          16 :   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1837             :                                               AMDGPU::OpName::src0_modifiers);
    1838          16 :   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1839             :                                               AMDGPU::OpName::src1_modifiers);
    1840          16 :   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    1841             :                                               AMDGPU::OpName::src2_modifiers);
    1842             : 
    1843          16 :   MI.RemoveOperand(Src2ModIdx);
    1844          16 :   MI.RemoveOperand(Src1ModIdx);
    1845          16 :   MI.RemoveOperand(Src0ModIdx);
    1846          16 : }
    1847             : 
    1848       47073 : bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
    1849             :                                 unsigned Reg, MachineRegisterInfo *MRI) const {
    1850       47073 :   if (!MRI->hasOneNonDBGUse(Reg))
    1851             :     return false;
    1852             : 
    1853       36592 :   unsigned Opc = UseMI.getOpcode();
    1854       18296 :   if (Opc == AMDGPU::COPY) {
    1855        3777 :     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    1856        7554 :     switch (DefMI.getOpcode()) {
    1857             :     default:
    1858             :       return false;
    1859             :     case AMDGPU::S_MOV_B64:
    1860             :       // TODO: We could fold 64-bit immediates, but this gets complicated
    1861             :       // when there are sub-registers.
    1862             :       return false;
    1863             : 
    1864             :     case AMDGPU::V_MOV_B32_e32:
    1865             :     case AMDGPU::S_MOV_B32:
    1866             :       break;
    1867             :     }
    1868        3512 :     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    1869        3512 :     const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    1870             :     assert(ImmOp);
    1871             :     // FIXME: We could handle FrameIndex values here.
    1872        3512 :     if (!ImmOp->isImm()) {
    1873             :       return false;
    1874             :     }
    1875       10485 :     UseMI.setDesc(get(NewOpc));
    1876        6990 :     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    1877        3495 :     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    1878        3495 :     return true;
    1879             :   }
    1880             : 
    1881       14519 :   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
    1882       14403 :       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    1883             :     // Don't fold if we are using source or output modifiers. The new VOP2
    1884             :     // instructions don't have them.
    1885         164 :     if (hasAnyModifiersSet(UseMI))
    1886             :       return false;
    1887             : 
    1888         264 :     const MachineOperand &ImmOp = DefMI.getOperand(1);
    1889             : 
    1890             :     // If this is a free constant, there's no reason to do this.
    1891             :     // TODO: We could fold this here instead of letting SIFoldOperands do it
    1892             :     // later.
    1893         132 :     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
    1894             : 
    1895             :     // Any src operand can be used for the legality check.
    1896         132 :     if (isInlineConstant(UseMI, *Src0, ImmOp))
    1897             :       return false;
    1898             : 
    1899          45 :     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    1900          45 :     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    1901          45 :     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
    1902             : 
    1903             :     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    1904             :     // We should only expect these to be on src0 due to canonicalizations.
    1905          45 :     if (Src0->isReg() && Src0->getReg() == Reg) {
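                     :     // Illustrative transform (virtual register names hypothetical):
                     :     //   %k = V_MOV_B32 <imm>;  V_MAD_F32 %d, %k, %a, %b
                     :     //     --> V_MADMK_F32 %d, %a, <imm>, %b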
    1906           0 :       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    1907             :         return false;
    1908             : 
    1909           0 :       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
    1910             :         return false;
    1911             : 
    1912             :       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
    1913             : 
    1914           0 :       const int64_t Imm = DefMI.getOperand(1).getImm();
    1915             : 
    1916             :       // FIXME: This would be a lot easier if we could return a new instruction
    1917             :       // instead of having to modify in place.
    1918             : 
    1919             :       // Remove these first since they are at the end.
    1920           0 :       UseMI.RemoveOperand(
    1921           0 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    1922           0 :       UseMI.RemoveOperand(
    1923           0 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    1924             : 
    1925           0 :       unsigned Src1Reg = Src1->getReg();
    1926           0 :       unsigned Src1SubReg = Src1->getSubReg();
    1927           0 :       Src0->setReg(Src1Reg);
    1928           0 :       Src0->setSubReg(Src1SubReg);
    1929           0 :       Src0->setIsKill(Src1->isKill());
    1930             : 
    1931           0 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    1932           0 :           Opc == AMDGPU::V_MAC_F16_e64)
    1933           0 :         UseMI.untieRegOperand(
    1934           0 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    1935             : 
    1936           0 :       Src1->ChangeToImmediate(Imm);
    1937             : 
    1938           0 :       removeModOperands(UseMI);
    1939           0 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
    1940             : 
    1941           0 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    1942           0 :       if (DeleteDef)
    1943           0 :         DefMI.eraseFromParent();
    1944             : 
    1945             :       return true;
    1946             :     }
    1947             : 
    1948             :     // Added part is the constant: Use v_madak_{f16, f32}.
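                     :     // Illustrative transform (virtual register names hypothetical):
                     :     //   %k = V_MOV_B32 <imm>;  V_MAD_F32 %d, %a, %b, %k
                     :     //     --> V_MADAK_F32 %d, %a, %b, <imm>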
    1949          90 :     if (Src2->isReg() && Src2->getReg() == Reg) {
    1950             :       // Not allowed to use constant bus for another operand.
    1951             :       // We can however allow an inline immediate as src0.
    1952          38 :       if (!Src0->isImm() &&
    1953          76 :           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
    1954             :         return false;
    1955             : 
    1956          51 :       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    1957             :         return false;
    1958             : 
    1959          16 :       const int64_t Imm = DefMI.getOperand(1).getImm();
    1960             : 
    1961             :       // FIXME: This would be a lot easier if we could return a new instruction
    1962             :       // instead of having to modify in place.
    1963             : 
    1964             :       // Remove these first since they are at the end.
    1965          16 :       UseMI.RemoveOperand(
    1966          16 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
    1967          16 :       UseMI.RemoveOperand(
    1968          16 :           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
    1969             : 
    1970          16 :       if (Opc == AMDGPU::V_MAC_F32_e64 ||
    1971          16 :           Opc == AMDGPU::V_MAC_F16_e64)
    1972          16 :         UseMI.untieRegOperand(
    1973          16 :             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
    1974             : 
    1975             :       // ChangingToImmediate adds Src2 back to the instruction.
    1976          16 :       Src2->ChangeToImmediate(Imm);
    1977             : 
    1978             :       // These come before src2.
    1979          16 :       removeModOperands(UseMI);
    1980          48 :       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
    1981             : 
    1982          16 :       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    1983          16 :       if (DeleteDef)
    1984           0 :         DefMI.eraseFromParent();
    1985             : 
    1986             :       return true;
    1987             :     }
    1988             :   }
    1989             : 
    1990             :   return false;
    1991             : }
    1992             : 
    1993             : static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
    1994             :                                 int WidthB, int OffsetB) {
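                     :   // Two accesses are disjoint iff the lower one ends at or before the higher
                     :   // one starts; e.g. {Offset 0, Width 4} and {Offset 8, Width 4} do not
                     :   // overlap, while {Offset 0, Width 8} and {Offset 4, Width 4} do.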
    1995       12005 :   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
    1996       12005 :   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
    1997       12005 :   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
    1998       12005 :   return LowOffset + LowWidth <= HighOffset;
    1999             : }
    2000             : 
    2001      835677 : bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
    2002             :                                                MachineInstr &MIb) const {
    2003             :   unsigned BaseReg0, BaseReg1;
    2004             :   int64_t Offset0, Offset1;
    2005             : 
    2006      891636 :   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
    2007       55959 :       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    2008             : 
    2009       53124 :     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
    2010             :       // FIXME: Handle ds_read2 / ds_write2.
    2011             :       return false;
    2012             :     }
    2013       38312 :     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    2014       38312 :     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    2015       50317 :     if (BaseReg0 == BaseReg1 &&
    2016       24010 :         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
    2017             :       return true;
    2018             :     }
    2019             :   }
    2020             : 
    2021             :   return false;
    2022             : }
    2023             : 
    2024      910298 : bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
    2025             :                                                   MachineInstr &MIb,
    2026             :                                                   AliasAnalysis *AA) const {
    2027             :   assert((MIa.mayLoad() || MIa.mayStore()) &&
    2028             :          "MIa must load from or modify a memory location");
    2029             :   assert((MIb.mayLoad() || MIb.mayStore()) &&
    2030             :          "MIb must load from or modify a memory location");
    2031             : 
    2032      910298 :   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    2033             :     return false;
    2034             : 
    2035             :   // XXX - Can we relax this between address spaces?
    2036      910298 :   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    2037             :     return false;
    2038             : 
    2039      910290 :   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
    2040         445 :     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
    2041         445 :     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
    2042         886 :     if (MMOa->getValue() && MMOb->getValue()) {
    2043        1317 :       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
    2044        1317 :       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
    2045         439 :       if (!AA->alias(LocA, LocB))
    2046         186 :         return true;
    2047             :     }
    2048             :   }
    2049             : 
    2050             :   // TODO: Should we check the address space from the MachineMemOperand? That
    2051             :   // would allow us to distinguish objects we know don't alias based on the
    2052             :   // underlying address space, even if it was lowered to a different one,
    2053             :   // e.g. private accesses lowered to use MUBUF instructions on a scratch
    2054             :   // buffer.
    2055      910104 :   if (isDS(MIa)) {
    2056       80231 :     if (isDS(MIb))
    2057       33781 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2058             : 
    2059       80989 :     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
    2060             :   }
    2061             : 
    2062      871177 :   if (isMUBUF(MIa) || isMTBUF(MIa)) {
    2063      801838 :     if (isMUBUF(MIb) || isMTBUF(MIb))
    2064      775408 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2065             : 
    2066       16322 :     return !isFLAT(MIb) && !isSMRD(MIb);
    2067             :   }
    2068             : 
    2069       41266 :   if (isSMRD(MIa)) {
    2070        1331 :     if (isSMRD(MIb))
    2071           0 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2072             : 
     2073        1331 :     return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
    2074             :   }
    2075             : 
    2076       39935 :   if (isFLAT(MIa)) {
    2077       39744 :     if (isFLAT(MIb))
    2078       26488 :       return checkInstOffsetsDoNotOverlap(MIa, MIb);
    2079             : 
    2080             :     return false;
    2081             :   }
    2082             : 
    2083             :   return false;
    2084             : }
    2085             : 
    2086         455 : static int64_t getFoldableImm(const MachineOperand* MO) {
    2087         455 :   if (!MO->isReg())
     2088             :     return 0;
    2089         455 :   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
    2090         455 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2091         455 :   auto Def = MRI.getUniqueVRegDef(MO->getReg());
    2092        1359 :   if (Def && (Def->getOpcode() == AMDGPU::S_MOV_B32 ||
    2093         928 :               Def->getOpcode() == AMDGPU::V_MOV_B32_e32) &&
    2094          40 :      Def->getOperand(1).isImm())
    2095          20 :     return Def->getOperand(1).getImm();
     2096             :   return 0;
    2097             : }
    2098             : 
    2099         169 : MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
    2100             :                                                  MachineInstr &MI,
    2101             :                                                  LiveVariables *LV) const {
    2102         169 :   bool IsF16 = false;
    2103             : 
    2104         338 :   switch (MI.getOpcode()) {
    2105             :   default:
    2106             :     return nullptr;
    2107           0 :   case AMDGPU::V_MAC_F16_e64:
    2108           0 :     IsF16 = true;
    2109             :     LLVM_FALLTHROUGH;
    2110             :   case AMDGPU::V_MAC_F32_e64:
    2111             :     break;
    2112           6 :   case AMDGPU::V_MAC_F16_e32:
    2113           6 :     IsF16 = true;
    2114             :     LLVM_FALLTHROUGH;
    2115         162 :   case AMDGPU::V_MAC_F32_e32: {
    2116         162 :     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
    2117         162 :                                              AMDGPU::OpName::src0);
    2118         324 :     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    2119         162 :     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
    2120             :       return nullptr;
    2121             :     break;
    2122             :   }
    2123             :   }
    2124             : 
    2125         167 :   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    2126         167 :   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
    2127         167 :   const MachineOperand *Src0Mods =
    2128             :     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
    2129         167 :   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
    2130         167 :   const MachineOperand *Src1Mods =
    2131             :     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
    2132         167 :   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
    2133         167 :   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2134         167 :   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2135             : 
    2136         167 :   if (!Src0Mods && !Src1Mods && !Clamp && !Omod) {
    2137         161 :     if (auto Imm = getFoldableImm(Src2)) {
    2138          10 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2139          30 :                      get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
    2140          10 :                .add(*Dst)
    2141          10 :                .add(*Src0)
    2142          10 :                .add(*Src1)
    2143          10 :                .addImm(Imm);
    2144             :     }
    2145         151 :     if (auto Imm = getFoldableImm(Src1)) {
    2146           8 :       return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2147          24 :                      get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2148           8 :                .add(*Dst)
    2149           8 :                .add(*Src0)
    2150           8 :                .addImm(Imm)
    2151           8 :                .add(*Src2);
    2152             :     }
    2153         143 :     if (auto Imm = getFoldableImm(Src0)) {
    2154           2 :       if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
    2155             :                            AMDGPU::OpName::src0), Src1))
    2156           2 :         return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2157           6 :                        get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
    2158           2 :                  .add(*Dst)
    2159           2 :                  .add(*Src1)
    2160           2 :                  .addImm(Imm)
    2161           2 :                  .add(*Src2);
    2162             :     }
    2163             :   }
    2164             : 
    2165         147 :   return BuildMI(*MBB, MI, MI.getDebugLoc(),
    2166         441 :                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
    2167         147 :       .add(*Dst)
    2168         294 :       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
    2169         147 :       .add(*Src0)
    2170         294 :       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
    2171         147 :       .add(*Src1)
     2172         147 :       .addImm(0) // Src2 mods
    2173         147 :       .add(*Src2)
    2174         294 :       .addImm(Clamp ? Clamp->getImm() : 0)
    2175         294 :       .addImm(Omod ? Omod->getImm() : 0);
    2176             : }
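
The ordering of the checks above amounts to: with no modifiers present, prefer V_MADAK when src2 folds to a constant, then V_MADMK when src1 (or a legally commuted src0) folds, and fall back to plain V_MAD otherwise. A deliberately simplified, hedged mirror of that decision; the enum and function here are illustrative only, not part of SIInstrInfo:

    #include <cstdint>
    #include <optional>

    enum class MadForm { MADAK, MADMK, MAD };

    // Simplified model of the selection in convertToThreeAddress: MADAK embeds
    // the constant as the addend (d = s0 * s1 + K), MADMK as the multiplier
    // (d = s0 * K + s2), and MAD carries full modifier support.
    static MadForm pickMadForm(std::optional<int64_t> Src0Imm,
                               std::optional<int64_t> Src1Imm,
                               std::optional<int64_t> Src2Imm,
                               bool HasAnyModifiers) {
      if (!HasAnyModifiers) {
        if (Src2Imm)
          return MadForm::MADAK;
        if (Src1Imm || Src0Imm)   // the src0 case additionally needs a legal commute
          return MadForm::MADMK;
      }
      return MadForm::MAD;
    }
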
    2177             : 
     2178             : // It's not generally safe to move VALU instructions across these: while GPR indexing
     2179             : // mode is active, VGPR operands are addressed through the index register, not directly.
    2180             : // XXX - Why isn't hasSideEffects sufficient for these?
    2181             : static bool changesVGPRIndexingMode(const MachineInstr &MI) {
    2182      423625 :   switch (MI.getOpcode()) {
    2183             :   case AMDGPU::S_SET_GPR_IDX_ON:
    2184             :   case AMDGPU::S_SET_GPR_IDX_MODE:
    2185             :   case AMDGPU::S_SET_GPR_IDX_OFF:
    2186             :     return true;
    2187             :   default:
    2188             :     return false;
    2189             :   }
    2190             : }
    2191             : 
    2192      454891 : bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
    2193             :                                        const MachineBasicBlock *MBB,
    2194             :                                        const MachineFunction &MF) const {
    2195             :   // XXX - Do we want the SP check in the base implementation?
    2196             : 
    2197             :   // Target-independent instructions do not have an implicit-use of EXEC, even
    2198             :   // when they operate on VGPRs. Treating EXEC modifications as scheduling
    2199             :   // boundaries prevents incorrect movements of such instructions.
    2200      880704 :   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
    2201     1275413 :          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
    2202      847546 :          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
    2203      878650 :          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
    2204      454891 :          changesVGPRIndexingMode(MI);
    2205             : }
    2206             : 
    2207        5143 : bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
    2208        5143 :   switch (Imm.getBitWidth()) {
    2209           0 :   case 32:
    2210           0 :     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
    2211           0 :                                         ST.hasInv2PiInlineImm());
    2212        5057 :   case 64:
    2213       10114 :     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
    2214       10114 :                                         ST.hasInv2PiInlineImm());
    2215          86 :   case 16:
    2216         172 :     return ST.has16BitInsts() &&
    2217         172 :            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
    2218          86 :                                         ST.hasInv2PiInlineImm());
    2219           0 :   default:
    2220           0 :     llvm_unreachable("invalid bitwidth");
    2221             :   }
    2222             : }
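
For reference, the integer half of the inline-constant rule implemented by the AMDGPUBaseInfo helpers is small: values from -16 to 64 inclusive encode directly in the instruction and never need a literal slot. A standalone hedged sketch, not the in-tree helper:

    #include <cassert>
    #include <cstdint>

    // Integer inline constants on this target cover -16..64; anything outside
    // that range has to be emitted as a 32-bit literal.
    static bool isInlinableInt(int64_t V) { return V >= -16 && V <= 64; }

    int main() {
      assert(isInlinableInt(0) && isInlinableInt(64) && isInlinableInt(-16));
      assert(!isInlinableInt(65) && !isInlinableInt(-17));
    }
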
    2223             : 
    2224     3761567 : bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
    2225             :                                    uint8_t OperandType) const {
    2226     3761567 :   if (!MO.isImm() ||
    2227     3761567 :       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
    2228             :       OperandType > AMDGPU::OPERAND_SRC_LAST)
    2229             :     return false;
    2230             : 
    2231             :   // MachineOperand provides no way to tell the true operand size, since it only
    2232             :   // records a 64-bit value. We need to know the size to determine if a 32-bit
    2233             :   // floating point immediate bit pattern is legal for an integer immediate. It
    2234             :   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
    2235             : 
    2236     3711007 :   int64_t Imm = MO.getImm();
    2237     3711007 :   switch (OperandType) {
    2238     3555203 :   case AMDGPU::OPERAND_REG_IMM_INT32:
    2239             :   case AMDGPU::OPERAND_REG_IMM_FP32:
    2240             :   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2241             :   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
    2242     3555203 :     int32_t Trunc = static_cast<int32_t>(Imm);
    2243     7108973 :     return Trunc == Imm &&
    2244     3553770 :            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
    2245             :   }
    2246       38218 :   case AMDGPU::OPERAND_REG_IMM_INT64:
    2247             :   case AMDGPU::OPERAND_REG_IMM_FP64:
    2248             :   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2249             :   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2250       38218 :     return AMDGPU::isInlinableLiteral64(MO.getImm(),
    2251       76436 :                                         ST.hasInv2PiInlineImm());
    2252      114341 :   case AMDGPU::OPERAND_REG_IMM_INT16:
    2253             :   case AMDGPU::OPERAND_REG_IMM_FP16:
    2254             :   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2255             :   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2256      114341 :     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
    2257             :       // A few special case instructions have 16-bit operands on subtargets
    2258             :       // where 16-bit instructions are not legal.
    2259             :       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
    2260             :       // constants in these cases
    2261      114270 :       int16_t Trunc = static_cast<int16_t>(Imm);
    2262      228536 :       return ST.has16BitInsts() &&
    2263      114266 :              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    2264             :     }
    2265             : 
    2266             :     return false;
    2267             :   }
    2268        3245 :   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    2269             :   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
    2270        3245 :     uint32_t Trunc = static_cast<uint32_t>(Imm);
    2271        3245 :     return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
    2272             :   }
    2273           0 :   default:
    2274           0 :     llvm_unreachable("invalid bitwidth");
    2275             :   }
    2276             : }
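
The Trunc == Imm guards above exist because a MachineOperand always stores its immediate as a 64-bit value; the 32-bit and 16-bit cases only accept values that survive a round trip through the narrower type. A small standalone illustration of that round-trip check (the helper name is hypothetical):

    #include <cassert>
    #include <cstdint>

    // A 64-bit stored immediate is usable as a 32-bit operand only if truncating
    // it and sign-extending it back reproduces the original value.
    static bool fitsInSExt32(int64_t Imm) {
      int32_t Trunc = static_cast<int32_t>(Imm);
      return static_cast<int64_t>(Trunc) == Imm;
    }

    int main() {
      assert(fitsInSExt32(0x3f800000));      // bit pattern of 1.0f
      assert(fitsInSExt32(-1));              // sign-extends cleanly
      assert(!fitsInSExt32(0x100000000LL));  // needs the upper 32 bits
    }
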
    2277             : 
    2278      416455 : bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
    2279             :                                         const MCOperandInfo &OpInfo) const {
    2280      416455 :   switch (MO.getType()) {
    2281             :   case MachineOperand::MO_Register:
    2282             :     return false;
    2283      124317 :   case MachineOperand::MO_Immediate:
    2284      248634 :     return !isInlineConstant(MO, OpInfo);
    2285             :   case MachineOperand::MO_FrameIndex:
    2286             :   case MachineOperand::MO_MachineBasicBlock:
    2287             :   case MachineOperand::MO_ExternalSymbol:
    2288             :   case MachineOperand::MO_GlobalAddress:
    2289             :   case MachineOperand::MO_MCSymbol:
    2290             :     return true;
    2291           0 :   default:
    2292           0 :     llvm_unreachable("unexpected operand type");
    2293             :   }
    2294             : }
    2295             : 
    2296             : static bool compareMachineOp(const MachineOperand &Op0,
    2297             :                              const MachineOperand &Op1) {
    2298       15325 :   if (Op0.getType() != Op1.getType())
    2299             :     return false;
    2300             : 
    2301       15325 :   switch (Op0.getType()) {
    2302        5045 :   case MachineOperand::MO_Register:
    2303       15325 :     return Op0.getReg() == Op1.getReg();
    2304             :   case MachineOperand::MO_Immediate:
    2305             :     return Op0.getImm() == Op1.getImm();
    2306             :   default:
    2307             :     llvm_unreachable("Didn't expect to be comparing these operand types");
    2308             :   }
    2309             : }
    2310             : 
    2311       68904 : bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
    2312             :                                     const MachineOperand &MO) const {
    2313      206712 :   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
    2314             : 
    2315             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    2316             : 
    2317       68904 :   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    2318             :     return true;
    2319             : 
    2320       68904 :   if (OpInfo.RegClass < 0)
    2321             :     return false;
    2322             : 
    2323      137498 :   if (MO.isImm() && isInlineConstant(MO, OpInfo))
    2324       91090 :     return RI.opCanUseInlineConstant(OpInfo.OperandType);
    2325             : 
    2326       46718 :   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
    2327             : }
    2328             : 
    2329      621596 : bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
    2330      621596 :   int Op32 = AMDGPU::getVOPe32(Opcode);
    2331      621596 :   if (Op32 == -1)
    2332             :     return false;
    2333             : 
    2334       86435 :   return pseudoToMCOpcode(Op32) != -1;
    2335             : }
    2336             : 
    2337           0 : bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
    2338             :   // The src0_modifier operand is present on all instructions
    2339             :   // that have modifiers.
    2340             : 
    2341           0 :   return AMDGPU::getNamedOperandIdx(Opcode,
    2342           0 :                                     AMDGPU::OpName::src0_modifiers) != -1;
    2343             : }
    2344             : 
    2345      166168 : bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
    2346             :                                   unsigned OpName) const {
    2347      166168 :   const MachineOperand *Mods = getNamedOperand(MI, OpName);
    2348      166168 :   return Mods && Mods->getImm();
    2349             : }
    2350             : 
    2351         164 : bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
    2352         320 :   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
    2353         311 :          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
    2354         291 :          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
    2355         434 :          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
    2356         298 :          hasModifiersSet(MI, AMDGPU::OpName::omod);
    2357             : }
    2358             : 
    2359     6559902 : bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
    2360             :                                   const MachineOperand &MO,
    2361             :                                   const MCOperandInfo &OpInfo) const {
    2362             :   // Literal constants use the constant bus.
    2363             :   //if (isLiteralConstantLike(MO, OpInfo))
    2364             :   // return true;
    2365     6559902 :   if (MO.isImm())
    2366     3413066 :     return !isInlineConstant(MO, OpInfo);
    2367             : 
    2368     4853369 :   if (!MO.isReg())
    2369             :     return true; // Misc other operands like FrameIndex
    2370             : 
    2371     4844218 :   if (!MO.isUse())
    2372             :     return false;
    2373             : 
    2374     9455694 :   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    2375     7199232 :     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
    2376             : 
    2377             :   // FLAT_SCR is just an SGPR pair.
    2378     2328103 :   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    2379             :     return true;
    2380             : 
    2381             :   // EXEC register uses the constant bus.
    2382     2328103 :   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    2383             :     return true;
    2384             : 
    2385             :   // SGPRs use the constant bus
    2386     4577565 :   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
    2387     2776973 :           (!MO.isImplicit() &&
    2388     4254327 :            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
    2389     3199689 :             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
    2390             : }
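
usesConstantBus feeds the rule that verifyInstruction enforces below: a VOP1/VOP2/VOP3/VOPC instruction may read the scalar constant bus at most once, where every literal counts as a read and an SGPR counts as a read unless it is the SGPR already counted. A deliberately simplified standalone model of that counting; the types here are hypothetical stand-ins for MachineOperand:

    #include <optional>
    #include <vector>

    // Hypothetical, simplified source-operand model for illustration only.
    struct SrcOperand {
      bool IsLiteral = false;          // 32-bit literal constant (not inlinable)
      std::optional<unsigned> SGPR;    // set when the source reads an SGPR
    };

    // Each literal uses the constant bus; an SGPR uses it too, unless it is the
    // SGPR that was just counted. More than one read makes the encoding illegal.
    static bool obeysConstantBusLimit(const std::vector<SrcOperand> &Srcs) {
      unsigned Reads = 0;
      std::optional<unsigned> SGPRUsed;
      for (const SrcOperand &S : Srcs) {
        if (S.IsLiteral) {
          ++Reads;
        } else if (S.SGPR && S.SGPR != SGPRUsed) {
          ++Reads;
          SGPRUsed = S.SGPR;
        }
      }
      return Reads <= 1;
    }
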
    2391             : 
    2392     3248296 : static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
    2393     7144352 :   for (const MachineOperand &MO : MI.implicit_operands()) {
    2394             :     // We only care about reads.
    2395     4084824 :     if (MO.isDef())
    2396      478016 :       continue;
    2397             : 
    2398     3606808 :     switch (MO.getReg()) {
    2399             :     case AMDGPU::VCC:
    2400             :     case AMDGPU::M0:
    2401             :     case AMDGPU::FLAT_SCR:
    2402             :       return MO.getReg();
    2403             : 
    2404             :     default:
    2405             :       break;
    2406             :     }
    2407             :   }
    2408             : 
    2409             :   return AMDGPU::NoRegister;
    2410             : }
    2411             : 
    2412     8971765 : static bool shouldReadExec(const MachineInstr &MI) {
    2413     8971765 :   if (SIInstrInfo::isVALU(MI)) {
    2414     6398960 :     switch (MI.getOpcode()) {
    2415             :     case AMDGPU::V_READLANE_B32:
    2416             :     case AMDGPU::V_READLANE_B32_si:
    2417             :     case AMDGPU::V_READLANE_B32_vi:
    2418             :     case AMDGPU::V_WRITELANE_B32:
    2419             :     case AMDGPU::V_WRITELANE_B32_si:
    2420             :     case AMDGPU::V_WRITELANE_B32_vi:
    2421             :       return false;
    2422             :     }
    2423             : 
    2424     3175881 :     return true;
    2425             :   }
    2426             : 
    2427    17316855 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
    2428    14521189 :       SIInstrInfo::isSALU(MI) ||
    2429     2976619 :       SIInstrInfo::isSMRD(MI))
    2430             :     return false;
    2431             : 
    2432             :   return true;
    2433             : }
    2434             : 
    2435        2767 : static bool isSubRegOf(const SIRegisterInfo &TRI,
    2436             :                        const MachineOperand &SuperVec,
    2437             :                        const MachineOperand &SubReg) {
    2438        5534 :   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    2439        3476 :     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
    2440             : 
    2441        2058 :   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
    2442        1029 :          SubReg.getReg() == SuperVec.getReg();
    2443             : }
    2444             : 
    2445    12472903 : bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
    2446             :                                     StringRef &ErrInfo) const {
    2447    24945806 :   uint16_t Opcode = MI.getOpcode();
    2448    12472903 :   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    2449             :     return true;
    2450             : 
    2451     8971765 :   const MachineFunction *MF = MI.getParent()->getParent();
    2452     8971765 :   const MachineRegisterInfo &MRI = MF->getRegInfo();
    2453             : 
    2454     8971765 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
    2455     8971765 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
    2456     8971765 :   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
    2457             : 
    2458             :   // Make sure the number of operands is correct.
    2459    17943530 :   const MCInstrDesc &Desc = get(Opcode);
    2460    17932216 :   if (!Desc.isVariadic() &&
    2461     8960451 :       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    2462           0 :     ErrInfo = "Instruction has wrong number of operands.";
    2463           0 :     return false;
    2464             :   }
    2465             : 
    2466     8971765 :   if (MI.isInlineAsm()) {
    2467             :     // Verify register classes for inlineasm constraints.
    2468           0 :     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
    2469           0 :          I != E; ++I) {
    2470           0 :       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
    2471           0 :       if (!RC)
    2472           0 :         continue;
    2473             : 
    2474           0 :       const MachineOperand &Op = MI.getOperand(I);
    2475           0 :       if (!Op.isReg())
    2476           0 :         continue;
    2477             : 
    2478           0 :       unsigned Reg = Op.getReg();
    2479           0 :       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
    2480           0 :         ErrInfo = "inlineasm operand has incorrect register class.";
    2481           0 :         return false;
    2482             :       }
    2483             :     }
    2484             : 
    2485             :     return true;
    2486             :   }
    2487             : 
    2488             :   // Make sure the register classes are correct.
    2489    40752708 :   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    2490    95342829 :     if (MI.getOperand(i).isFPImm()) {
    2491           0 :       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
    2492             :                 "all fp values to integers.";
    2493           0 :       return false;
    2494             :     }
    2495             : 
    2496    31780943 :     int RegClass = Desc.OpInfo[i].RegClass;
    2497             : 
    2498    31780943 :     switch (Desc.OpInfo[i].OperandType) {
    2499    12682035 :     case MCOI::OPERAND_REGISTER:
    2500    12682035 :       if (MI.getOperand(i).isImm()) {
    2501           0 :         ErrInfo = "Illegal immediate value for operand.";
    2502           0 :         return false;
    2503             :       }
    2504             :       break;
    2505             :     case AMDGPU::OPERAND_REG_IMM_INT32:
    2506             :     case AMDGPU::OPERAND_REG_IMM_FP32:
    2507             :       break;
    2508     4187051 :     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    2509             :     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    2510             :     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    2511             :     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    2512             :     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    2513             :     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    2514     4187051 :       const MachineOperand &MO = MI.getOperand(i);
    2515     5930000 :       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
    2516           0 :         ErrInfo = "Illegal immediate value for operand.";
    2517           0 :         return false;
    2518             :       }
    2519             :       break;
    2520             :     }
    2521     9592345 :     case MCOI::OPERAND_IMMEDIATE:
    2522             :     case AMDGPU::OPERAND_KIMM32:
    2523             :       // Check if this operand is an immediate.
    2524             :       // FrameIndex operands will be replaced by immediates, so they are
    2525             :       // allowed.
    2526     9592345 :       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
    2527           0 :         ErrInfo = "Expected immediate, but got non-immediate";
    2528           0 :         return false;
    2529             :       }
    2530             :       LLVM_FALLTHROUGH;
    2531             :     default:
    2532    10978018 :       continue;
    2533             :     }
    2534             : 
    2535    62408775 :     if (!MI.getOperand(i).isReg())
    2536     4044884 :       continue;
    2537             : 
    2538    16758041 :     if (RegClass != -1) {
    2539    16758041 :       unsigned Reg = MI.getOperand(i).getReg();
    2540    41386060 :       if (Reg == AMDGPU::NoRegister ||
    2541    16758041 :           TargetRegisterInfo::isVirtualRegister(Reg))
    2542     7869978 :         continue;
    2543             : 
    2544    17776126 :       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
    2545    17776126 :       if (!RC->contains(Reg)) {
    2546           0 :         ErrInfo = "Operand has incorrect register class.";
    2547           0 :         return false;
    2548             :       }
    2549             :     }
    2550             :   }
    2551             : 
    2552             :   // Verify SDWA
    2553     8971765 :   if (isSDWA(MI)) {
    2554       31018 :     if (!ST.hasSDWA()) {
    2555           0 :       ErrInfo = "SDWA is not supported on this target";
    2556           0 :       return false;
    2557             :     }
    2558             : 
    2559       31018 :     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
    2560             : 
     2561       31018 :     const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
     2562             : 
     2563      155090 :     for (int OpIdx : OpIndices) {
    2564      124072 :       if (OpIdx == -1)
    2565       34109 :         continue;
    2566      179926 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2567             : 
    2568       89963 :       if (!ST.hasSDWAScalar()) {
     2569             :         // Only VGPRs are allowed on VI
    2570       84823 :         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
    2571           0 :           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
    2572           0 :           return false;
    2573             :         }
    2574             :       } else {
    2575             :         // No immediates on GFX9
    2576        5140 :         if (!MO.isReg()) {
    2577           0 :           ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
    2578           0 :           return false;
    2579             :         }
    2580             :       }
    2581             :     }
    2582             : 
    2583       31018 :     if (!ST.hasSDWAOmod()) {
    2584             :       // No omod allowed on VI
    2585       29230 :       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2586       35363 :       if (OMod != nullptr &&
    2587       12266 :         (!OMod->isImm() || OMod->getImm() != 0)) {
    2588           0 :         ErrInfo = "OMod not allowed in SDWA instructions on VI";
    2589           0 :         return false;
    2590             :       }
    2591             :     }
    2592             : 
    2593       31018 :     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    2594       62036 :     if (isVOPC(BasicOpcode)) {
    2595          58 :       if (!ST.hasSDWASdst() && DstIdx != -1) {
    2596             :         // Only vcc allowed as dst on VI for VOPC
    2597           0 :         const MachineOperand &Dst = MI.getOperand(DstIdx);
    2598           0 :         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
    2599           0 :           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
    2600           0 :           return false;
    2601             :         }
    2602          58 :       } else if (!ST.hasSDWAOutModsVOPC()) {
    2603             :         // No clamp allowed on GFX9 for VOPC
    2604          45 :         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
    2605          90 :         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
     2606           0 :           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9";
    2607           0 :           return false;
    2608             :         }
    2609             : 
    2610             :         // No omod allowed on GFX9 for VOPC
    2611          45 :         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
    2612          45 :         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
     2613           0 :           ErrInfo = "OMod not allowed in VOPC SDWA instructions on GFX9";
    2614           0 :           return false;
    2615             :         }
    2616             :       }
    2617             :     }
    2618             :   }
    2619             : 
    2620             :   // Verify VOP*
    2621    35798064 :   if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
    2622             :     // Only look at the true operands. Only a real operand can use the constant
    2623             :     // bus, and we don't want to check pseudo-operands like the source modifier
    2624             :     // flags.
    2625     3184115 :     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
    2626             : 
    2627     3184115 :     unsigned ConstantBusCount = 0;
    2628             : 
    2629     3184115 :     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
    2630         978 :       ++ConstantBusCount;
    2631             : 
    2632     3184115 :     unsigned SGPRUsed = findImplicitSGPRRead(MI);
    2633     3184115 :     if (SGPRUsed != AMDGPU::NoRegister)
    2634      183872 :       ++ConstantBusCount;
    2635             : 
    2636    15292875 :     for (int OpIdx : OpIndices) {
    2637     8671370 :       if (OpIdx == -1)
    2638             :         break;
    2639    12108760 :       const MachineOperand &MO = MI.getOperand(OpIdx);
    2640     6054380 :       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
    2641     1251494 :         if (MO.isReg()) {
    2642     1114685 :           if (MO.getReg() != SGPRUsed)
    2643     1101289 :             ++ConstantBusCount;
    2644             :           SGPRUsed = MO.getReg();
    2645             :         } else {
    2646      136809 :           ++ConstantBusCount;
    2647             :         }
    2648             :       }
    2649             :     }
    2650     3184115 :     if (ConstantBusCount > 1) {
    2651           0 :       ErrInfo = "VOP* instruction uses the constant bus more than once";
    2652           0 :       return false;
    2653             :     }
    2654             :   }
    2655             : 
    2656             :   // Verify misc. restrictions on specific instructions.
    2657     8971765 :   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
    2658             :       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    2659       21424 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    2660       21424 :     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    2661       21424 :     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    2662       31704 :     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
    2663       15325 :       if (!compareMachineOp(Src0, Src1) &&
    2664        5045 :           !compareMachineOp(Src0, Src2)) {
    2665           0 :         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
    2666           0 :         return false;
    2667             :       }
    2668             :     }
    2669             :   }
    2670             : 
    2671     8971765 :   if (isSOPK(MI)) {
    2672       10268 :     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    2673       10268 :     if (sopkIsZext(MI)) {
    2674         672 :       if (!isUInt<16>(Imm)) {
    2675           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    2676           0 :         return false;
    2677             :       }
    2678             :     } else {
    2679        9596 :       if (!isInt<16>(Imm)) {
    2680           0 :         ErrInfo = "invalid immediate for SOPK instruction";
    2681           0 :         return false;
    2682             :       }
    2683             :     }
    2684             :   }
    2685             : 
    2686     8969703 :   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
    2687     8969703 :       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
    2688    17940763 :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    2689             :       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    2690        2767 :     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
    2691        2767 :                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
    2692             : 
    2693             :     const unsigned StaticNumOps = Desc.getNumOperands() +
    2694        2767 :       Desc.getNumImplicitUses();
    2695        2767 :     const unsigned NumImplicitOps = IsDst ? 2 : 1;
    2696             : 
    2697             :     // Allow additional implicit operands. This allows a fixup done by the post
    2698             :     // RA scheduler where the main implicit operand is killed and implicit-defs
    2699             :     // are added for sub-registers that remain live after this instruction.
    2700        2767 :     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
    2701           0 :       ErrInfo = "missing implicit register operands";
    2702           0 :       return false;
    2703             :     }
    2704             : 
    2705        2767 :     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    2706        2767 :     if (IsDst) {
    2707         705 :       if (!Dst->isUse()) {
    2708           0 :         ErrInfo = "v_movreld_b32 vdst should be a use operand";
    2709           0 :         return false;
    2710             :       }
    2711             : 
    2712             :       unsigned UseOpIdx;
    2713        1410 :       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
    2714         705 :           UseOpIdx != StaticNumOps + 1) {
    2715           0 :         ErrInfo = "movrel implicit operands should be tied";
    2716           0 :         return false;
    2717             :       }
    2718             :     }
    2719             : 
    2720        5534 :     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    2721             :     const MachineOperand &ImpUse
    2722        5534 :       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    2723        8301 :     if (!ImpUse.isReg() || !ImpUse.isUse() ||
    2724        2767 :         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
    2725           0 :       ErrInfo = "src0 should be subreg of implicit vector use";
    2726           0 :       return false;
    2727             :     }
    2728             :   }
    2729             : 
    2730             :   // Make sure we aren't losing exec uses in the td files. This mostly requires
     2731             :   // being careful when using 'let Uses = ...' to add other use registers.
    2732     8971765 :   if (shouldReadExec(MI)) {
    2733     4959248 :     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
    2734           0 :       ErrInfo = "VALU instruction does not implicitly read exec mask";
    2735           0 :       return false;
    2736             :     }
    2737             :   }
    2738             : 
    2739     8971765 :   if (isSMRD(MI)) {
    2740     1193252 :     if (MI.mayStore()) {
    2741             :       // The register offset form of scalar stores may only use m0 as the
    2742             :       // soffset register.
    2743         834 :       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
    2744         834 :       if (Soff && Soff->getReg() != AMDGPU::M0) {
    2745           0 :         ErrInfo = "scalar stores must use m0 as offset register";
    2746           0 :         return false;
    2747             :       }
    2748             :     }
    2749             :   }
    2750             : 
    2751     8971765 :   if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
    2752      274592 :     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    2753      274592 :     if (Offset->getImm() != 0) {
    2754           0 :       ErrInfo = "subtarget does not support offsets in flat instructions";
    2755           0 :       return false;
    2756             :     }
    2757             :   }
    2758             : 
    2759             :   return true;
    2760             : }
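
On failure this hook fills ErrInfo and returns false; the machine verifier is the usual caller and turns that into a diagnostic. A hedged usage sketch, assuming a MachineFunction MF and a const SIInstrInfo *TII are already in scope (the in-tree MachineVerifier does considerably more than this):

    // Illustrative only: walk every instruction and report the first failure.
    StringRef ErrInfo;
    for (const MachineBasicBlock &MBB : MF)
      for (const MachineInstr &MI : MBB)
        if (!TII->verifyInstruction(MI, ErrInfo))
          report_fatal_error(Twine("instruction failed verification: ") + ErrInfo);
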
    2761             : 
    2762      106750 : unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
    2763      213500 :   switch (MI.getOpcode()) {
    2764             :   default: return AMDGPU::INSTRUCTION_LIST_END;
    2765       21326 :   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
    2766       47327 :   case AMDGPU::COPY: return AMDGPU::COPY;
    2767         329 :   case AMDGPU::PHI: return AMDGPU::PHI;
    2768          22 :   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
    2769           4 :   case AMDGPU::WQM: return AMDGPU::WQM;
    2770           4 :   case AMDGPU::WWM: return AMDGPU::WWM;
    2771          23 :   case AMDGPU::S_MOV_B32:
    2772          46 :     return MI.getOperand(1).isReg() ?
    2773             :            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
    2774        5322 :   case AMDGPU::S_ADD_I32:
    2775        5322 :   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
    2776        3524 :   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
    2777        2179 :   case AMDGPU::S_SUB_I32:
    2778        2179 :   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
    2779        1249 :   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
    2780         466 :   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
    2781        3243 :   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
    2782        3595 :   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
    2783         213 :   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
    2784          81 :   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
    2785          50 :   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
    2786          67 :   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
    2787          16 :   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
    2788        1770 :   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
    2789         213 :   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
    2790        4410 :   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
    2791        1658 :   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
    2792        2691 :   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
    2793         138 :   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
    2794         357 :   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
    2795         720 :   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
    2796        2004 :   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
    2797        1467 :   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
    2798           0 :   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
    2799          12 :   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
    2800           4 :   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
    2801          16 :   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
    2802           0 :   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
    2803           0 :   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
    2804          10 :   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
    2805           4 :   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
    2806           5 :   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
    2807           0 :   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
    2808          22 :   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
    2809          29 :   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
    2810           0 :   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
    2811           3 :   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
    2812           2 :   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
    2813           0 :   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
    2814           1 :   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
    2815           1 :   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
    2816          64 :   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
    2817          14 :   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
    2818         158 :   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
    2819           2 :   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
    2820           0 :   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
    2821          77 :   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
    2822             :   }
    2823             : }
    2824             : 
    2825           0 : bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
    2826           0 :   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
    2827             : }
    2828             : 
    2829     1721335 : const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
    2830             :                                                       unsigned OpNo) const {
    2831     1721335 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    2832     5164005 :   const MCInstrDesc &Desc = get(MI.getOpcode());
    2833     4654149 :   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
    2834     1326125 :       Desc.OpInfo[OpNo].RegClass == -1) {
    2835     1512984 :     unsigned Reg = MI.getOperand(OpNo).getReg();
    2836             : 
    2837      756492 :     if (TargetRegisterInfo::isVirtualRegister(Reg))
    2838      447255 :       return MRI.getRegClass(Reg);
    2839      309237 :     return RI.getPhysRegClass(Reg);
    2840             :   }
    2841             : 
    2842      964843 :   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
    2843     1929686 :   return RI.getRegClass(RCID);
    2844             : }
    2845             : 
    2846      137021 : bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
    2847      137021 :   switch (MI.getOpcode()) {
    2848       88424 :   case AMDGPU::COPY:
    2849             :   case AMDGPU::REG_SEQUENCE:
    2850             :   case AMDGPU::PHI:
    2851             :   case AMDGPU::INSERT_SUBREG:
    2852       88424 :     return RI.hasVGPRs(getOpRegClass(MI, 0));
    2853       48597 :   default:
    2854       48597 :     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
    2855             :   }
    2856             : }
    2857             : 
    2858       20090 : void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
    2859       20090 :   MachineBasicBlock::iterator I = MI;
    2860       20090 :   MachineBasicBlock *MBB = MI.getParent();
    2861       40180 :   MachineOperand &MO = MI.getOperand(OpIdx);
    2862       20090 :   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    2863       60270 :   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
    2864       40180 :   const TargetRegisterClass *RC = RI.getRegClass(RCID);
    2865       20090 :   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
    2866       20090 :   if (MO.isReg())
    2867             :     Opcode = AMDGPU::COPY;
    2868           0 :   else if (RI.isSGPRClass(RC))
    2869           0 :     Opcode = AMDGPU::S_MOV_B32;
    2870             : 
    2871       20090 :   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
    2872       20090 :   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    2873             :     VRC = &AMDGPU::VReg_64RegClass;
    2874             :   else
    2875       17904 :     VRC = &AMDGPU::VGPR_32RegClass;
    2876             : 
    2877       20090 :   unsigned Reg = MRI.createVirtualRegister(VRC);
    2878       40180 :   DebugLoc DL = MBB->findDebugLoc(I);
    2879       60270 :   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
    2880       20090 :   MO.ChangeToRegister(Reg, false);
    2881       20090 : }
    2882             : 
    2883        1948 : unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
    2884             :                                          MachineRegisterInfo &MRI,
    2885             :                                          MachineOperand &SuperReg,
    2886             :                                          const TargetRegisterClass *SuperRC,
    2887             :                                          unsigned SubIdx,
    2888             :                                          const TargetRegisterClass *SubRC)
    2889             :                                          const {
    2890        1948 :   MachineBasicBlock *MBB = MI->getParent();
    2891        5844 :   DebugLoc DL = MI->getDebugLoc();
    2892        1948 :   unsigned SubReg = MRI.createVirtualRegister(SubRC);
    2893             : 
    2894        1948 :   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    2895        5844 :     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    2896        1948 :       .addReg(SuperReg.getReg(), 0, SubIdx);
    2897        1948 :     return SubReg;
    2898             :   }
    2899             : 
    2900             :   // Just in case the super register is itself a sub-register, copy it to a new
    2901             :   // value so we don't need to worry about merging its subreg index with the
    2902             :   // SubIdx passed to this function. The register coalescer should be able to
    2903             :   // eliminate this extra copy.
    2904           0 :   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
    2905             : 
    2906           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    2907           0 :     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
    2908             : 
    2909           0 :   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    2910           0 :     .addReg(NewSuperReg, 0, SubIdx);
    2911             : 
    2912           0 :   return SubReg;
    2913             : }
    2914             : 
    2915        1912 : MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
    2916             :   MachineBasicBlock::iterator MII,
    2917             :   MachineRegisterInfo &MRI,
    2918             :   MachineOperand &Op,
    2919             :   const TargetRegisterClass *SuperRC,
    2920             :   unsigned SubIdx,
    2921             :   const TargetRegisterClass *SubRC) const {
    2922        1912 :   if (Op.isImm()) {
    2923           0 :     if (SubIdx == AMDGPU::sub0)
    2924           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    2925           0 :     if (SubIdx == AMDGPU::sub1)
    2926           0 :       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
    2927             : 
    2928           0 :     llvm_unreachable("Unhandled register index for immediate");
    2929             :   }
    2930             : 
    2931             :   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
    2932        1912 :                                        SubIdx, SubRC);
    2933             :   return MachineOperand::CreateReg(SubReg, false);
    2934             : }
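
The immediate path above hands back the low 32 bits of a 64-bit immediate for sub0 and the high 32 bits for sub1. A tiny standalone check of that split, with values picked only for illustration:

    #include <cassert>
    #include <cstdint>

    // sub0 selects the low 32 bits of a 64-bit immediate, sub1 the high 32 bits.
    static int32_t lo32(int64_t Imm) { return static_cast<int32_t>(Imm); }
    static int32_t hi32(int64_t Imm) { return static_cast<int32_t>(Imm >> 32); }

    int main() {
      const int64_t Imm = 0x123456789ABCDEF0LL;
      assert(lo32(Imm) == static_cast<int32_t>(0x9ABCDEF0u));
      assert(hi32(Imm) == 0x12345678);
    }
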
    2935             : 
    2936             : // Change the order of operands from (0, 1, 2) to (0, 2, 1)
    2937        4922 : void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
    2938             :   assert(Inst.getNumExplicitOperands() == 3);
    2939        4922 :   MachineOperand Op1 = Inst.getOperand(1);
    2940        4922 :   Inst.RemoveOperand(1);
    2941        4922 :   Inst.addOperand(Op1);
    2942        4922 : }
    2943             : 
    2944      320363 : bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
    2945             :                                     const MCOperandInfo &OpInfo,
    2946             :                                     const MachineOperand &MO) const {
    2947      320363 :   if (!MO.isReg())
    2948             :     return false;
    2949             : 
    2950      319899 :   unsigned Reg = MO.getReg();
    2951             :   const TargetRegisterClass *RC =
    2952      639798 :     TargetRegisterInfo::isVirtualRegister(Reg) ?
    2953             :     MRI.getRegClass(Reg) :
    2954      324247 :     RI.getPhysRegClass(Reg);
    2955             : 
    2956             :   const SIRegisterInfo *TRI =
    2957      639798 :       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
    2958      319899 :   RC = TRI->getSubRegClass(RC, MO.getSubReg());
    2959             : 
    2960             :   // In order to be legal, the common sub-class must be equal to the
    2961             :   // class of the current operand.  For example:
    2962             :   //
    2963             :   // v_mov_b32 s0 ; Operand defined as vsrc_b32
    2964             :   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
    2965             :   //
    2966             :   // s_sendmsg 0, s0 ; Operand defined as m0reg
    2967             :   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
    2968             : 
    2969      639798 :   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
    2970             : }
    2971             : 
    2972           0 : bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
    2973             :                                      const MCOperandInfo &OpInfo,
    2974             :                                      const MachineOperand &MO) const {
    2975           0 :   if (MO.isReg())
    2976           0 :     return isLegalRegOperand(MRI, OpInfo, MO);
    2977             : 
    2978             :   // Handle non-register types that are treated like immediates.
    2979             :   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
    2980             :   return true;
    2981             : }
    2982             : 
    2983      391358 : bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
    2984             :                                  const MachineOperand *MO) const {
    2985      391358 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    2986      391358 :   const MCInstrDesc &InstDesc = MI.getDesc();
    2987      391358 :   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
    2988             :   const TargetRegisterClass *DefinedRC =
    2989      782716 :       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
    2990      391358 :   if (!MO)
    2991           0 :     MO = &MI.getOperand(OpIdx);
    2992             : 
    2993      391358 :   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    2994             : 
    2995       88177 :     RegSubRegPair SGPRUsed;
    2996       88177 :     if (MO->isReg())
    2997       69704 :       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
    2998             : 
    2999      456786 :     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    3000      396921 :       if (i == OpIdx)
    3001       81344 :         continue;
    3002      631154 :       const MachineOperand &Op = MI.getOperand(i);
    3003      315577 :       if (Op.isReg()) {
    3004      540470 :         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
    3005      232742 :             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
    3006             :           return false;
    3007             :         }
    3008       45345 :       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
    3009             :         return false;
    3010             :       }
    3011             :     }
    3012             :   }
    3013             : 
    3014      363046 :   if (MO->isReg()) {
    3015             :     assert(DefinedRC);
    3016      294477 :     return isLegalRegOperand(MRI, OpInfo, *MO);
    3017             :   }
    3018             : 
    3019             :   // Handle non-register types that are treated like immediates.
    3020             :   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
    3021             : 
    3022       68569 :   if (!DefinedRC) {
    3023             :     // This operand expects an immediate.
    3024             :     return true;
    3025             :   }
    3026             : 
    3027       68569 :   return isImmOperandLegal(MI, OpIdx, *MO);
    3028             : }
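                     : 
                     : // A simplified sketch of the constant bus rule enforced above: a VALU
                     : // instruction may read SGPRs / literal constants through the constant bus at
                     : // most once (reusing the same SGPR, as tracked by SGPRUsed, is allowed).
                     : // The enum and helper below are hypothetical, for illustration only.
                     : #include <vector>
                     : 
                     : enum class ToyOperandKind { VGPR, SGPR, Literal };
                     : 
                     : static bool fitsConstantBusLimit(const std::vector<ToyOperandKind> &Ops) {
                     :   int BusReads = 0;
                     :   for (ToyOperandKind Op : Ops)
                     :     if (Op != ToyOperandKind::VGPR) // SGPRs and literals both use the bus.
                     :       ++BusReads;
                     :   return BusReads <= 1;
                     : }
                     : 
                     : // e.g. v_add_f32 dst, s0, v1 is fine; v_add_f32 dst, s0, s1 must be fixed up.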
    3029             : 
    3030       17184 : void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
    3031             :                                        MachineInstr &MI) const {
    3032       34368 :   unsigned Opc = MI.getOpcode();
    3033       34368 :   const MCInstrDesc &InstrDesc = get(Opc);
    3034             : 
    3035       17184 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    3036       34368 :   MachineOperand &Src1 = MI.getOperand(Src1Idx);
    3037             : 
    3038             :   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32,
    3039             :   // we need to have only one constant bus use.
    3040             :   //
    3041             :   // Note we do not need to worry about literal constants here. They are
    3042             :   // disabled for the operand type for instructions because they will always
    3043             :   // violate the one constant bus use rule.
    3044       17184 :   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
    3045       17184 :   if (HasImplicitSGPR) {
    3046        4773 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3047        9546 :     MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3048             : 
    3049        4773 :     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
    3050        2687 :       legalizeOpWithMove(MI, Src0Idx);
    3051             :   }
    3052             : 
    3053             :   // VOP2 src0 instructions support all operand types, so we don't need to check
    3054             :   // their legality. If src1 is already legal, we don't need to do anything.
    3055       17184 :   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    3056             :     return;
    3057             : 
    3058             :   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
    3059             :   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
    3060             :   // select is uniform.
    3061       10791 :   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
    3062           1 :       RI.isVGPR(MRI, Src1.getReg())) {
    3063           1 :     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    3064           1 :     const DebugLoc &DL = MI.getDebugLoc();
    3065           3 :     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
    3066           1 :         .add(Src1);
    3067           1 :     Src1.ChangeToRegister(Reg, false);
    3068           1 :     return;
    3069             :   }
    3070             : 
    3071             :   // We do not use commuteInstruction here because it is too aggressive and will
    3072             :   // commute if it is possible. We only want to commute here if it improves
    3073             :   // legality. This can be called a fairly large number of times so don't waste
    3074             :   // compile time pointlessly swapping and checking legality again.
    3075       19490 :   if (HasImplicitSGPR || !MI.isCommutable()) {
    3076        2086 :     legalizeOpWithMove(MI, Src1Idx);
    3077        2086 :     return;
    3078             :   }
    3079             : 
    3080        8702 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    3081       17404 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
    3082             : 
    3083             :   // If src0 can be used as src1, commuting will make the operands legal.
    3084             :   // Otherwise we have to give up and insert a move.
    3085             :   //
    3086             :   // TODO: Other immediate-like operand kinds could be commuted if there was a
    3087             :   // MachineOperand::ChangeTo* for them.
    3088       25642 :   if ((!Src1.isImm() && !Src1.isReg()) ||
    3089        8702 :       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    3090           0 :     legalizeOpWithMove(MI, Src1Idx);
    3091           0 :     return;
    3092             :   }
    3093             : 
    3094        8702 :   int CommutedOpc = commuteOpcode(MI);
    3095        8702 :   if (CommutedOpc == -1) {
    3096           0 :     legalizeOpWithMove(MI, Src1Idx);
    3097           0 :     return;
    3098             :   }
    3099             : 
    3100       26106 :   MI.setDesc(get(CommutedOpc));
    3101             : 
    3102        8702 :   unsigned Src0Reg = Src0.getReg();
    3103        8702 :   unsigned Src0SubReg = Src0.getSubReg();
    3104        8702 :   bool Src0Kill = Src0.isKill();
    3105             : 
    3106        8702 :   if (Src1.isImm())
    3107         464 :     Src0.ChangeToImmediate(Src1.getImm());
    3108        8238 :   else if (Src1.isReg()) {
    3109        8238 :     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    3110        8238 :     Src0.setSubReg(Src1.getSubReg());
    3111             :   } else
    3112           0 :     llvm_unreachable("Should only have register or immediate operands");
    3113             : 
    3114        8702 :   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
    3115             :   Src1.setSubReg(Src0SubReg);
    3116             : }
    3117             : 
    3118             : // Legalize VOP3 operands. Because all operand types are supported for any
    3119             : // operand, and literal constants are not allowed and should never be
    3120             : // seen, we only need to worry about inserting copies if we use multiple SGPR
    3121             : // operands.
    3122       46997 : void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
    3123             :                                        MachineInstr &MI) const {
    3124       93994 :   unsigned Opc = MI.getOpcode();
    3125             : 
    3126             :   int VOP3Idx[3] = {
    3127       46997 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    3128       46997 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    3129       46997 :     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
    3130      140991 :   };
    3131             : 
    3132             :   // Find the one SGPR operand we are allowed to use.
    3133       46997 :   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
    3134             : 
    3135      152088 :   for (unsigned i = 0; i < 3; ++i) {
    3136      137335 :     int Idx = VOP3Idx[i];
    3137      137335 :     if (Idx == -1)
    3138             :       break;
    3139      210182 :     MachineOperand &MO = MI.getOperand(Idx);
    3140             : 
    3141             :     // We should never see a VOP3 instruction with an illegal immediate operand.
    3142      105091 :     if (!MO.isReg())
    3143       10827 :       continue;
    3144             : 
    3145      282792 :     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
    3146       46430 :       continue; // VGPRs are legal
    3147             : 
    3148       80351 :     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
    3149       32517 :       SGPRReg = MO.getReg();
    3150             :       // We can use one SGPR in each VOP3 instruction.
    3151       32517 :       continue;
    3152             :     }
    3153             : 
    3154             :     // If we make it this far, then the operand is not legal and we must
    3155             :     // legalize it.
    3156       15317 :     legalizeOpWithMove(MI, Idx);
    3157             :   }
    3158       46997 : }
    3159             : 
    3160          21 : unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
    3161             :                                          MachineRegisterInfo &MRI) const {
    3162          21 :   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
    3163          21 :   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
    3164          21 :   unsigned DstReg = MRI.createVirtualRegister(SRC);
    3165          42 :   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
    3166             : 
    3167          42 :   SmallVector<unsigned, 8> SRegs;
    3168          87 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3169          66 :     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3170         132 :     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3171         198 :             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
    3172          66 :         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    3173          66 :     SRegs.push_back(SGPR);
    3174             :   }
    3175             : 
    3176             :   MachineInstrBuilder MIB =
    3177          21 :       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
    3178          84 :               get(AMDGPU::REG_SEQUENCE), DstReg);
    3179          87 :   for (unsigned i = 0; i < SubRegs; ++i) {
    3180         132 :     MIB.addReg(SRegs[i]);
    3181         132 :     MIB.addImm(RI.getSubRegFromChannel(i));
    3182             :   }
    3183          42 :   return DstReg;
    3184             : }
    3185             : 
    3186          13 : void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
    3187             :                                        MachineInstr &MI) const {
    3188             : 
    3189             :   // If the pointer is stored in VGPRs, then we need to move it to
    3190             :   // SGPRs using v_readfirstlane.  This is safe because we only select
    3191             :   // loads with uniform pointers to SMRD instructions, so we know the
    3192             :   // pointer value is uniform.
    3193          13 :   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
    3194          39 :   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    3195          13 :       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    3196          13 :       SBase->setReg(SGPR);
    3197             :   }
    3198          13 : }
    3199             : 
    3200       27402 : void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
    3201             :                                          MachineBasicBlock::iterator I,
    3202             :                                          const TargetRegisterClass *DstRC,
    3203             :                                          MachineOperand &Op,
    3204             :                                          MachineRegisterInfo &MRI,
    3205             :                                          const DebugLoc &DL) const {
    3206       27402 :   unsigned OpReg = Op.getReg();
    3207       27402 :   unsigned OpSubReg = Op.getSubReg();
    3208             : 
    3209       27402 :   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
    3210       27402 :       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
    3211             : 
    3212             :   // Check if operand is already the correct register class.
    3213       27402 :   if (DstRC == OpRC)
    3214             :     return;
    3215             : 
    3216       27169 :   unsigned DstReg = MRI.createVirtualRegister(DstRC);
    3217             :   MachineInstr *Copy =
    3218       81507 :       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
    3219             : 
    3220       27169 :   Op.setReg(DstReg);
    3221       27169 :   Op.setSubReg(0);
    3222             : 
    3223       27169 :   MachineInstr *Def = MRI.getVRegDef(OpReg);
    3224       27169 :   if (!Def)
    3225             :     return;
    3226             : 
    3227             :   // Try to eliminate the copy if it is copying an immediate value.
    3228       27169 :   if (Def->isMoveImmediate())
    3229        5467 :     FoldImmediate(*Copy, *Def, OpReg, &MRI);
    3230             : }
    3231             : 
    3232       90420 : void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
    3233       90420 :   MachineFunction &MF = *MI.getParent()->getParent();
    3234       90420 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3235             : 
    3236             :   // Legalize VOP2
    3237      163733 :   if (isVOP2(MI) || isVOPC(MI)) {
    3238       17184 :     legalizeOperandsVOP2(MRI, MI);
    3239       17184 :     return;
    3240             :   }
    3241             : 
    3242             :   // Legalize VOP3
    3243       73236 :   if (isVOP3(MI)) {
    3244       19307 :     legalizeOperandsVOP3(MRI, MI);
    3245       19307 :     return;
    3246             :   }
    3247             : 
    3248             :   // Legalize SMRD
    3249       53929 :   if (isSMRD(MI)) {
    3250          13 :     legalizeOperandsSMRD(MRI, MI);
    3251          13 :     return;
    3252             :   }
    3253             : 
    3254             :   // Legalize REG_SEQUENCE and PHI
    3255             :   // The register class of the operands must match the register
    3256             :   // class of the output.
    3257      107832 :   if (MI.getOpcode() == AMDGPU::PHI) {
    3258         329 :     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    3259         993 :     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
    3260        2656 :       if (!MI.getOperand(i).isReg() ||
    3261        1328 :           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
    3262           0 :         continue;
    3263             :       const TargetRegisterClass *OpRC =
    3264        1328 :           MRI.getRegClass(MI.getOperand(i).getReg());
    3265         664 :       if (RI.hasVGPRs(OpRC)) {
    3266             :         VRC = OpRC;
    3267             :       } else {
    3268         431 :         SRC = OpRC;
    3269             :       }
    3270             :     }
    3271             : 
    3272             :     // If any of the operands are VGPR registers, then they all must be VGPRs,
    3273             :     // otherwise we will create illegal VGPR->SGPR copies when legalizing
    3274             :     // them.
    3275         437 :     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
    3276         329 :       if (!VRC) {
    3277             :         assert(SRC);
    3278         108 :         VRC = RI.getEquivalentVGPRClass(SRC);
    3279             :       }
    3280             :       RC = VRC;
    3281             :     } else {
    3282             :       RC = SRC;
    3283             :     }
    3284             : 
    3285             :     // Update all the operands so they have the same type.
    3286         993 :     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3287        1328 :       MachineOperand &Op = MI.getOperand(I);
    3288        1328 :       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3289           0 :         continue;
    3290             : 
    3291             :       // MI is a PHI instruction.
    3292        1328 :       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
    3293         664 :       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
    3294             : 
    3295             :       // Avoid creating no-op copies with the same src and dst reg class.  These
    3296             :       // confuse some of the machine passes.
    3297         664 :       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    3298             :     }
    3299             :   }
    3300             : 
    3301             :   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
    3302             :   // VGPR dest type and SGPR sources, insert copies so all operands are
    3303             :   // VGPRs. This seems to help operand folding / the register coalescer.
    3304      107832 :   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    3305       21326 :     MachineBasicBlock *MBB = MI.getParent();
    3306       21326 :     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    3307       21326 :     if (RI.hasVGPRs(DstRC)) {
    3308             :       // Update all the operands so they are VGPR register classes. These may
    3309             :       // not be the same register class because REG_SEQUENCE supports mixing
    3310             :       // subregister index types e.g. sub0_sub1 + sub2 + sub3
    3311       73776 :       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    3312      104900 :         MachineOperand &Op = MI.getOperand(I);
    3313      104900 :         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
    3314           0 :           continue;
    3315             : 
    3316      104900 :         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
    3317       52450 :         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
    3318       78164 :         if (VRC == OpRC)
    3319       25714 :           continue;
    3320             : 
    3321       53472 :         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
    3322             :         Op.setIsKill();
    3323             :       }
    3324             :     }
    3325             : 
    3326             :     return;
    3327             :   }
    3328             : 
    3329             :   // Legalize INSERT_SUBREG
    3330             :   // src0 must have the same register class as dst
    3331       32590 :   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    3332          22 :     unsigned Dst = MI.getOperand(0).getReg();
    3333          22 :     unsigned Src0 = MI.getOperand(1).getReg();
    3334          22 :     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    3335          22 :     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    3336          22 :     if (DstRC != Src0RC) {
    3337           2 :       MachineBasicBlock *MBB = MI.getParent();
    3338           4 :       MachineOperand &Op = MI.getOperand(1);
    3339           4 :       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    3340             :     }
    3341             :     return;
    3342             :   }
    3343             : 
    3344             :   // Legalize MIMG and MUBUF/MTBUF for shaders.
    3345             :   //
    3346             :   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
    3347             :   // scratch memory access. In both cases, the legalization never involves
    3348             :   // conversion to the addr64 form.
    3349       65132 :   if (isMIMG(MI) ||
    3350       65336 :       (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
    3351         412 :        (isMUBUF(MI) || isMTBUF(MI)))) {
    3352           8 :     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    3353          32 :     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
    3354           6 :       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
    3355           6 :       SRsrc->setReg(SGPR);
    3356             :     }
    3357             : 
    3358           8 :     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    3359          16 :     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
    3360           2 :       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
    3361           2 :       SSamp->setReg(SGPR);
    3362             :     }
    3363             :     return;
    3364             :   }
    3365             : 
    3366             :   // Legalize MUBUF* instructions by converting to addr64 form.
    3367             :   // FIXME: If we start using the non-addr64 instructions for compute, we
    3368             :   // may need to legalize them as above. This especially applies to the
    3369             :   // buffer_load_format_* variants and variants with idxen (or bothen).
    3370             :   int SRsrcIdx =
    3371       32560 :       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    3372       32560 :   if (SRsrcIdx != -1) {
    3373             :     // We have an MUBUF instruction
    3374          72 :     MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
    3375          72 :     unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
    3376         108 :     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
    3377             :                                              RI.getRegClass(SRsrcRC))) {
    3378             :       // The operands are legal.
    3379             :       // FIXME: We may need to legalize operands besides srsrc.
    3380             :       return;
    3381             :     }
    3382             : 
    3383          36 :     MachineBasicBlock &MBB = *MI.getParent();
    3384             : 
    3385             :     // Extract the ptr from the resource descriptor.
    3386          72 :     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
    3387          36 :       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
    3388             : 
    3389             :     // Create an empty resource descriptor
    3390          36 :     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    3391          36 :     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3392          36 :     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    3393          36 :     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    3394          36 :     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
    3395             : 
    3396             :     // Zero64 = 0
    3397         144 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
    3398          36 :         .addImm(0);
    3399             : 
    3400             :     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    3401         108 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
    3402          72 :         .addImm(RsrcDataFormat & 0xFFFFFFFF);
    3403             : 
    3404             :     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    3405         108 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
    3406          72 :         .addImm(RsrcDataFormat >> 32);
    3407             : 
    3408             :     // NewSRsrc = {Zero64, SRsrcFormat}
    3409         108 :     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
    3410          36 :         .addReg(Zero64)
    3411          36 :         .addImm(AMDGPU::sub0_sub1)
    3412          36 :         .addReg(SRsrcFormatLo)
    3413          36 :         .addImm(AMDGPU::sub2)
    3414          36 :         .addReg(SRsrcFormatHi)
    3415          36 :         .addImm(AMDGPU::sub3);
    3416             : 
    3417          36 :     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    3418          36 :     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    3419          36 :     if (VAddr) {
    3420             :       // This is already an ADDR64 instruction so we need to add the pointer
    3421             :       // extracted from the resource descriptor to the current value of VAddr.
    3422           8 :       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3423           8 :       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3424             : 
    3425             :       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
    3426          24 :       DebugLoc DL = MI.getDebugLoc();
    3427          24 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
    3428           8 :         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
    3429           8 :         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
    3430             : 
    3431             :       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
    3432          24 :       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
    3433           8 :         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
    3434           8 :         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
    3435             : 
    3436             :       // NewVaddr = {NewVaddrHi, NewVaddrLo}
    3437          24 :       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
    3438           8 :           .addReg(NewVAddrLo)
    3439           8 :           .addImm(AMDGPU::sub0)
    3440           8 :           .addReg(NewVAddrHi)
    3441           8 :           .addImm(AMDGPU::sub1);
    3442             :     } else {
    3443             :       // This instruction is the _OFFSET variant, so we need to convert it to
    3444             :       // ADDR64.
    3445             :       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
    3446             :              < SISubtarget::VOLCANIC_ISLANDS &&
    3447             :              "FIXME: Need to emit flat atomics here");
    3448             : 
    3449          28 :       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
    3450          28 :       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    3451          28 :       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
    3452          56 :       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
    3453             : 
    3454             :       // Atomics with return have an additional tied operand and are
    3455             :       // missing some of the special bits.
    3456          28 :       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
    3457             :       MachineInstr *Addr64;
    3458             : 
    3459          28 :       if (!VDataIn) {
    3460             :         // Regular buffer load / store.
    3461             :         MachineInstrBuilder MIB =
    3462          81 :             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    3463          27 :                 .add(*VData)
    3464          27 :                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
    3465             :                 // This will be replaced later
    3466             :                 // with the new value of vaddr.
    3467          27 :                 .add(*SRsrc)
    3468          27 :                 .add(*SOffset)
    3469          27 :                 .add(*Offset);
    3470             : 
    3471             :         // Atomics do not have this operand.
    3472          27 :         if (const MachineOperand *GLC =
    3473             :                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
    3474          26 :           MIB.addImm(GLC->getImm());
    3475             :         }
    3476             : 
    3477          81 :         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
    3478             : 
    3479          27 :         if (const MachineOperand *TFE =
    3480             :                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
    3481          26 :           MIB.addImm(TFE->getImm());
    3482             :         }
    3483             : 
    3484          81 :         MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3485          27 :         Addr64 = MIB;
    3486             :       } else {
    3487             :         // Atomics with return.
    3488           3 :         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
    3489           1 :                      .add(*VData)
    3490           1 :                      .add(*VDataIn)
    3491           1 :                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
    3492             :                      // This will be replaced later
    3493             :                      // with the new value of vaddr.
    3494           1 :                      .add(*SRsrc)
    3495           1 :                      .add(*SOffset)
    3496           1 :                      .add(*Offset)
    3497           3 :                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
    3498           3 :                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    3499             :       }
    3500             : 
    3501          28 :       MI.removeFromParent();
    3502             : 
    3503             :       // NewVaddr is just the pointer extracted from the resource descriptor.
    3504          56 :       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
    3505          84 :               NewVAddr)
    3506          28 :           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
    3507          28 :           .addImm(AMDGPU::sub0)
    3508          28 :           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
    3509          28 :           .addImm(AMDGPU::sub1);
    3510             : 
    3511          28 :       VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
    3512          28 :       SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
    3513             :     }
    3514             : 
    3515             :     // Update the instruction to use NewVaddr
    3516          36 :     VAddr->setReg(NewVAddr);
    3517             :     // Update the instruction to use NewSRsrc
    3518          36 :     SRsrc->setReg(NewSRsrc);
    3519             :   }
    3520             : }
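                     : 
                     : // A small sketch of the arithmetic behind the V_ADD_I32 / V_ADDC_U32 pair
                     : // emitted above: the 64-bit pointer add is done one 32-bit half at a time,
                     : // with the carry (held in VCC on the hardware) propagated into the high half.
                     : #include <cstdint>
                     : 
                     : static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
                     :   uint32_t Lo = uint32_t(A) + uint32_t(B);                      // V_ADD_I32
                     :   uint32_t Carry = Lo < uint32_t(A) ? 1u : 0u;                  // carry out
                     :   uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry;  // V_ADDC_U32
                     :   return (uint64_t(Hi) << 32) | Lo;
                     : }
                     : 
                     : // add64ViaHalves(0xFFFFFFFFu, 1) == 0x100000000, exercising the carry path.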
    3521             : 
    3522       31513 : void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
    3523       63026 :   SetVectorType Worklist;
    3524       31513 :   Worklist.insert(&TopInst);
    3525             : 
    3526      138263 :   while (!Worklist.empty()) {
    3527      106750 :     MachineInstr &Inst = *Worklist.pop_back_val();
    3528      106750 :     MachineBasicBlock *MBB = Inst.getParent();
    3529      106750 :     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    3530             : 
    3531      106750 :     unsigned Opcode = Inst.getOpcode();
    3532      106750 :     unsigned NewOpcode = getVALUOp(Inst);
    3533             : 
    3534             :     // Handle some special cases
    3535      108566 :     switch (Opcode) {
    3536             :     default:
    3537             :       break;
    3538         111 :     case AMDGPU::S_AND_B64:
    3539         111 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
    3540         111 :       Inst.eraseFromParent();
    3541         111 :       continue;
    3542             : 
    3543         216 :     case AMDGPU::S_OR_B64:
    3544         216 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
    3545         216 :       Inst.eraseFromParent();
    3546         216 :       continue;
    3547             : 
    3548         130 :     case AMDGPU::S_XOR_B64:
    3549         130 :       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
    3550         130 :       Inst.eraseFromParent();
    3551         130 :       continue;
    3552             : 
    3553          16 :     case AMDGPU::S_NOT_B64:
    3554          16 :       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
    3555          16 :       Inst.eraseFromParent();
    3556          16 :       continue;
    3557             : 
    3558          26 :     case AMDGPU::S_BCNT1_I32_B64:
    3559          26 :       splitScalar64BitBCNT(Worklist, Inst);
    3560          26 :       Inst.eraseFromParent();
    3561          26 :       continue;
    3562             : 
    3563        1247 :     case AMDGPU::S_BFE_I64:
    3564        1247 :       splitScalar64BitBFE(Worklist, Inst);
    3565        1247 :       Inst.eraseFromParent();
    3566        1247 :       continue;
    3567             : 
    3568        4410 :     case AMDGPU::S_LSHL_B32:
    3569        4410 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3570        2080 :         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
    3571        2080 :         swapOperands(Inst);
    3572             :       }
    3573             :       break;
    3574        1770 :     case AMDGPU::S_ASHR_I32:
    3575        1770 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3576         741 :         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
    3577         741 :         swapOperands(Inst);
    3578             :       }
    3579             :       break;
    3580        2691 :     case AMDGPU::S_LSHR_B32:
    3581        2691 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3582        1218 :         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
    3583        1218 :         swapOperands(Inst);
    3584             :       }
    3585             :       break;
    3586        1658 :     case AMDGPU::S_LSHL_B64:
    3587        1658 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3588         719 :         NewOpcode = AMDGPU::V_LSHLREV_B64;
    3589         719 :         swapOperands(Inst);
    3590             :       }
    3591             :       break;
    3592         213 :     case AMDGPU::S_ASHR_I64:
    3593         213 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3594          95 :         NewOpcode = AMDGPU::V_ASHRREV_I64;
    3595          95 :         swapOperands(Inst);
    3596             :       }
    3597             :       break;
    3598         138 :     case AMDGPU::S_LSHR_B64:
    3599         138 :       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    3600          69 :         NewOpcode = AMDGPU::V_LSHRREV_B64;
    3601          69 :         swapOperands(Inst);
    3602             :       }
    3603             :       break;
    3604             : 
    3605          16 :     case AMDGPU::S_ABS_I32:
    3606          16 :       lowerScalarAbs(Worklist, Inst);
    3607          16 :       Inst.eraseFromParent();
    3608          16 :       continue;
    3609             : 
    3610          77 :     case AMDGPU::S_CBRANCH_SCC0:
    3611             :     case AMDGPU::S_CBRANCH_SCC1:
    3612             :       // Clear unused bits of vcc
    3613         154 :       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
    3614         231 :               AMDGPU::VCC)
    3615          77 :           .addReg(AMDGPU::EXEC)
    3616          77 :           .addReg(AMDGPU::VCC);
    3617          77 :       break;
    3618             : 
    3619           0 :     case AMDGPU::S_BFE_U64:
    3620             :     case AMDGPU::S_BFM_B64:
    3621           0 :       llvm_unreachable("Moving this op to VALU not implemented");
    3622             : 
    3623          54 :     case AMDGPU::S_PACK_LL_B32_B16:
    3624             :     case AMDGPU::S_PACK_LH_B32_B16:
    3625             :     case AMDGPU::S_PACK_HH_B32_B16:
    3626          54 :       movePackToVALU(Worklist, MRI, Inst);
    3627          54 :       Inst.eraseFromParent();
    3628          54 :       continue;
    3629             :     }
    3630             : 
    3631      100070 :     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
    3632             :       // We cannot move this instruction to the VALU, so we should try to
    3633             :       // legalize its operands instead.
    3634          58 :       legalizeOperands(Inst);
    3635          58 :       continue;
    3636             :     }
    3637             : 
    3638             :     // Use the new VALU Opcode.
    3639      209752 :     const MCInstrDesc &NewDesc = get(NewOpcode);
    3640      209752 :     Inst.setDesc(NewDesc);
    3641             : 
    3642             :     // Remove any references to SCC. Vector instructions can't read from it, and
    3643             :     // we're just about to add the implicit use / defs of VCC, and we don't want
    3644             :     // both.
    3645      367541 :     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
    3646      525330 :       MachineOperand &Op = Inst.getOperand(i);
    3647      262665 :       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
    3648       38885 :         Inst.RemoveOperand(i);
    3649       38885 :         addSCCDefUsersToVALUWorklist(Inst, Worklist);
    3650             :       }
    3651             :     }
    3652             : 
    3653      104876 :     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
    3654             :       // We are converting these to a BFE, so we need to add the missing
    3655             :       // operands for the size and offset.
    3656        1077 :       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
    3657        1077 :       Inst.addOperand(MachineOperand::CreateImm(0));
    3658        2154 :       Inst.addOperand(MachineOperand::CreateImm(Size));
    3659             : 
    3660      103799 :     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
    3661             :       // The VALU version adds the second operand to the result, so insert an
    3662             :       // extra 0 operand.
    3663          64 :       Inst.addOperand(MachineOperand::CreateImm(0));
    3664             :     }
    3665             : 
    3666      104876 :     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    3667             : 
    3668      104876 :     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
    3669        3471 :       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
    3670             :       // If we need to move this to VGPRs, we need to unpack the second operand
    3671             :       // back into the 2 separate ones for bit offset and width.
    3672             :       assert(OffsetWidthOp.isImm() &&
    3673             :              "Scalar BFE is only implemented for constant width and offset");
    3674        3471 :       uint32_t Imm = OffsetWidthOp.getImm();
    3675             : 
    3676        3471 :       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    3677        3471 :       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
    3678        3471 :       Inst.RemoveOperand(2);                     // Remove old immediate.
    3679        6942 :       Inst.addOperand(MachineOperand::CreateImm(Offset));
    3680        6942 :       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    3681             :     }
    3682             : 
    3683      314551 :     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    3684      104722 :     unsigned NewDstReg = AMDGPU::NoRegister;
    3685             :     if (HasDst) {
    3686      104722 :       unsigned DstReg = Inst.getOperand(0).getReg();
    3687      104722 :       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
    3688          46 :         continue;
    3689             : 
    3690             :       // Update the destination register class.
    3691      104676 :       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
    3692      104676 :       if (!NewDstRC)
    3693           0 :         continue;
    3694             : 
    3695      167362 :       if (Inst.isCopy() &&
    3696      199127 :           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
    3697       47147 :           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
    3698             :         // Instead of creating a copy where src and dst are the same register
    3699             :         // class, we just replace all uses of dst with src.  These kinds of
    3700             :         // copies interfere with the heuristics MachineSink uses to decide
    3701             :         // whether or not to split a critical edge, since the pass assumes
    3702             :         // that copies will end up as machine instructions and not be
    3703             :         // eliminated.
    3704       15382 :         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
    3705       15382 :         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
    3706       15382 :         MRI.clearKillFlags(Inst.getOperand(1).getReg());
    3707       15382 :         Inst.getOperand(0).setReg(DstReg);
    3708       15382 :         continue;
    3709             :       }
    3710             : 
    3711       89294 :       NewDstReg = MRI.createVirtualRegister(NewDstRC);
    3712       89294 :       MRI.replaceRegWith(DstReg, NewDstReg);
    3713             :     }
    3714             : 
    3715             :     // Legalize the operands
    3716       89448 :     legalizeOperands(Inst);
    3717             : 
    3718       89448 :     if (HasDst)
    3719       89294 :      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    3720             :   }
    3721       31513 : }
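                     : 
                     : // A quick check of the S_BFE immediate packing unpacked in moveToVALU above:
                     : // the bit offset lives in bits [5:0] of the packed source and the field width
                     : // in bits [22:16]. The example value below is arbitrary.
                     : #include <cstdint>
                     : 
                     : constexpr uint32_t bfeOffset(uint32_t Imm) { return Imm & 0x3f; }
                     : constexpr uint32_t bfeWidth(uint32_t Imm) { return (Imm & 0x7f0000) >> 16; }
                     : 
                     : static_assert(bfeOffset(0x00100004) == 4, "offset is in bits [5:0]");
                     : static_assert(bfeWidth(0x00100004) == 16, "width is in bits [22:16]");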
    3722             : 
    3723          16 : void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
    3724             :                                  MachineInstr &Inst) const {
    3725          16 :   MachineBasicBlock &MBB = *Inst.getParent();
    3726          16 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    3727          16 :   MachineBasicBlock::iterator MII = Inst;
    3728          48 :   DebugLoc DL = Inst.getDebugLoc();
    3729             : 
    3730          16 :   MachineOperand &Dest = Inst.getOperand(0);
    3731          16 :   MachineOperand &Src = Inst.getOperand(1);
    3732          16 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3733          16 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3734             : 
    3735          48 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
    3736          16 :     .addImm(0)
    3737          16 :     .addReg(Src.getReg());
    3738             : 
    3739          48 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    3740          16 :     .addReg(Src.getReg())
    3741          16 :     .addReg(TmpReg);
    3742             : 
    3743          16 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    3744          16 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    3745          16 : }
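                     : 
                     : // The expansion above computes |x| as max(x, 0 - x), matching the
                     : // V_SUB_I32 / V_MAX_I32 pair (with the usual caveat that INT32_MIN has no
                     : // positive counterpart). A plain C++ sketch of the same identity:
                     : #include <algorithm>
                     : #include <cstdint>
                     : 
                     : static int32_t absViaSubMax(int32_t X) {
                     :   int32_t Neg = int32_t(0u - uint32_t(X)); // V_SUB_I32 0, x (wrapping negate)
                     :   return std::max(X, Neg);                 // V_MAX_I32
                     : }
                     : 
                     : // absViaSubMax(-7) == 7 and absViaSubMax(5) == 5.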
    3746             : 
    3747          16 : void SIInstrInfo::splitScalar64BitUnaryOp(
    3748             :     SetVectorType &Worklist, MachineInstr &Inst,
    3749             :     unsigned Opcode) const {
    3750          16 :   MachineBasicBlock &MBB = *Inst.getParent();
    3751          16 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    3752             : 
    3753          16 :   MachineOperand &Dest = Inst.getOperand(0);
    3754          32 :   MachineOperand &Src0 = Inst.getOperand(1);
    3755          48 :   DebugLoc DL = Inst.getDebugLoc();
    3756             : 
    3757          16 :   MachineBasicBlock::iterator MII = Inst;
    3758             : 
    3759          32 :   const MCInstrDesc &InstDesc = get(Opcode);
    3760          32 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    3761          16 :     MRI.getRegClass(Src0.getReg()) :
    3762          16 :     &AMDGPU::SGPR_32RegClass;
    3763             : 
    3764          16 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    3765             : 
    3766             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    3767          16 :                                                        AMDGPU::sub0, Src0SubRC);
    3768             : 
    3769          32 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    3770          16 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    3771          16 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    3772             : 
    3773          16 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    3774          32 :   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
    3775             : 
    3776             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    3777          16 :                                                        AMDGPU::sub1, Src0SubRC);
    3778             : 
    3779          16 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    3780          32 :   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
    3781             : 
    3782          16 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    3783          48 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    3784          16 :     .addReg(DestSub0)
    3785          16 :     .addImm(AMDGPU::sub0)
    3786          16 :     .addReg(DestSub1)
    3787          16 :     .addImm(AMDGPU::sub1);
    3788             : 
    3789          16 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    3790             : 
    3791             :   // We don't need to legalizeOperands here because for a single operand, src0
    3792             :   // will support any kind of input.
    3793             : 
    3794             :   // Move all users of this moved value.
    3795          16 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    3796          16 : }
    3797             : 
    3798         457 : void SIInstrInfo::splitScalar64BitBinaryOp(
    3799             :     SetVectorType &Worklist, MachineInstr &Inst,
    3800             :     unsigned Opcode) const {
    3801         457 :   MachineBasicBlock &MBB = *Inst.getParent();
    3802         457 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    3803             : 
    3804         457 :   MachineOperand &Dest = Inst.getOperand(0);
    3805         914 :   MachineOperand &Src0 = Inst.getOperand(1);
    3806         914 :   MachineOperand &Src1 = Inst.getOperand(2);
    3807        1371 :   DebugLoc DL = Inst.getDebugLoc();
    3808             : 
    3809         457 :   MachineBasicBlock::iterator MII = Inst;
    3810             : 
    3811         914 :   const MCInstrDesc &InstDesc = get(Opcode);
    3812         914 :   const TargetRegisterClass *Src0RC = Src0.isReg() ?
    3813         457 :     MRI.getRegClass(Src0.getReg()) :
    3814         457 :     &AMDGPU::SGPR_32RegClass;
    3815             : 
    3816         457 :   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
    3817         914 :   const TargetRegisterClass *Src1RC = Src1.isReg() ?
    3818         457 :     MRI.getRegClass(Src1.getReg()) :
    3819         457 :     &AMDGPU::SGPR_32RegClass;
    3820             : 
    3821         457 :   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
    3822             : 
    3823             :   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    3824         457 :                                                        AMDGPU::sub0, Src0SubRC);
    3825             :   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    3826         457 :                                                        AMDGPU::sub0, Src1SubRC);
    3827             : 
    3828         914 :   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
    3829         457 :   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
    3830         457 :   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
    3831             : 
    3832         457 :   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
    3833         457 :   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    3834         457 :                               .add(SrcReg0Sub0)
    3835         457 :                               .add(SrcReg1Sub0);
    3836             : 
    3837             :   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
    3838         457 :                                                        AMDGPU::sub1, Src0SubRC);
    3839             :   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
    3840         457 :                                                        AMDGPU::sub1, Src1SubRC);
    3841             : 
    3842         457 :   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
    3843         457 :   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    3844         457 :                               .add(SrcReg0Sub1)
    3845         457 :                               .add(SrcReg1Sub1);
    3846             : 
    3847         457 :   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
    3848        1371 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    3849         457 :     .addReg(DestSub0)
    3850         457 :     .addImm(AMDGPU::sub0)
    3851         457 :     .addReg(DestSub1)
    3852         457 :     .addImm(AMDGPU::sub1);
    3853             : 
    3854         457 :   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
    3855             : 
    3856             :   // Try to legalize the operands in case we need to swap the order to keep it
    3857             :   // valid.
    3858         457 :   legalizeOperands(LoHalf);
    3859         457 :   legalizeOperands(HiHalf);
    3860             : 
    3861             :   // Move all users of this moved value.
    3862         457 :   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
    3863         457 : }
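                     : 
                     : // The per-half split above is sound for the bitwise opcodes this helper is
                     : // used with (S_AND_B64 / S_OR_B64 / S_XOR_B64), since every result bit
                     : // depends only on the same bit position of the inputs. A sketch of the same
                     : // decomposition in plain C++:
                     : #include <cstdint>
                     : 
                     : constexpr uint64_t and64ViaHalves(uint64_t A, uint64_t B) {
                     :   return (uint64_t(uint32_t(A >> 32) & uint32_t(B >> 32)) << 32) |
                     :          (uint32_t(A) & uint32_t(B));
                     : }
                     : 
                     : static_assert(and64ViaHalves(0xF0F0F0F012345678ull, 0x0FF00FF0FFFF0000ull) ==
                     :                   (0xF0F0F0F012345678ull & 0x0FF00FF0FFFF0000ull),
                     :               "bitwise ops split cleanly into 32-bit halves");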
    3864             : 
    3865          26 : void SIInstrInfo::splitScalar64BitBCNT(
    3866             :     SetVectorType &Worklist, MachineInstr &Inst) const {
    3867          26 :   MachineBasicBlock &MBB = *Inst.getParent();
    3868          26 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    3869             : 
    3870          26 :   MachineBasicBlock::iterator MII = Inst;
    3871          78 :   DebugLoc DL = Inst.getDebugLoc();
    3872             : 
    3873          26 :   MachineOperand &Dest = Inst.getOperand(0);
    3874          52 :   MachineOperand &Src = Inst.getOperand(1);
    3875             : 
    3876          52 :   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
    3877          52 :   const TargetRegisterClass *SrcRC = Src.isReg() ?
    3878          26 :     MRI.getRegClass(Src.getReg()) :
    3879          26 :     &AMDGPU::SGPR_32RegClass;
    3880             : 
    3881          26 :   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3882          26 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3883             : 
    3884          26 :   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
    3885             : 
    3886             :   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    3887          26 :                                                       AMDGPU::sub0, SrcSubRC);
    3888             :   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
    3889          26 :                                                       AMDGPU::sub1, SrcSubRC);
    3890             : 
    3891          78 :   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
    3892             : 
    3893          52 :   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
    3894             : 
    3895          26 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    3896             : 
    3897             :   // We don't need to legalize operands here. src0 for either instruction can be
    3898             :   // an SGPR, and the second input is unused or determined here.
    3899          26 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    3900          26 : }
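                     : 
                     : // The split above relies on V_BCNT_U32_B32 adding its second operand to the
                     : // population count, so counting the high half with the low-half count as the
                     : // add-in yields the full 64-bit count. In plain C++ (using the GCC/Clang
                     : // builtin for the 32-bit counts):
                     : #include <cstdint>
                     : 
                     : static int popcount64ViaHalves(uint64_t X) {
                     :   int Low = __builtin_popcount(uint32_t(X));           // first V_BCNT, add 0
                     :   return __builtin_popcount(uint32_t(X >> 32)) + Low;  // second V_BCNT, add Low
                     : }
                     : 
                     : // popcount64ViaHalves(X) == __builtin_popcountll(X) for any X.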
    3901             : 
    3902        1247 : void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
    3903             :                                       MachineInstr &Inst) const {
    3904        1247 :   MachineBasicBlock &MBB = *Inst.getParent();
    3905        1247 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    3906        1247 :   MachineBasicBlock::iterator MII = Inst;
    3907        2500 :   DebugLoc DL = Inst.getDebugLoc();
    3908             : 
    3909        1247 :   MachineOperand &Dest = Inst.getOperand(0);
    3910        1247 :   uint32_t Imm = Inst.getOperand(2).getImm();
    3911        1247 :   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
    3912        1247 :   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
    3913             : 
    3914             :   (void) Offset;
    3915             : 
    3916             :   // Only sext_inreg cases handled.
    3917             :   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
    3918             :          Offset == 0 && "Not implemented");
    3919             : 
    3920        1247 :   if (BitWidth < 32) {
    3921        1241 :     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3922        1241 :     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3923        1241 :     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    3924             : 
    3925        3723 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
    3926        1241 :         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
    3927        1241 :         .addImm(0)
    3928        2482 :         .addImm(BitWidth);
    3929             : 
    3930        3723 :     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
    3931        1241 :       .addImm(31)
    3932        1241 :       .addReg(MidRegLo);
    3933             : 
    3934        3723 :     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    3935        1241 :       .addReg(MidRegLo)
    3936        1241 :       .addImm(AMDGPU::sub0)
    3937        1241 :       .addReg(MidRegHi)
    3938        1241 :       .addImm(AMDGPU::sub1);
    3939             : 
    3940        1241 :     MRI.replaceRegWith(Dest.getReg(), ResultReg);
    3941        1241 :     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    3942        1241 :     return;
    3943             :   }
    3944             : 
    3945           6 :   MachineOperand &Src = Inst.getOperand(1);
    3946           6 :   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3947           6 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    3948             : 
    3949          18 :   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    3950           6 :     .addImm(31)
    3951           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0);
    3952             : 
    3953          18 :   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    3954           6 :     .addReg(Src.getReg(), 0, AMDGPU::sub0)
    3955           6 :     .addImm(AMDGPU::sub0)
    3956           6 :     .addReg(TmpReg)
    3957           6 :     .addImm(AMDGPU::sub1);
    3958             : 
    3959           6 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    3960           6 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    3961             : }
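
The split above depends on how the S_BFE_I64 immediate is packed (offset in bits [5:0], field width in bits [22:16]) and on the fact that, for Offset == 0 and BitWidth < 32, a V_BFE_I32 on the low half followed by an arithmetic shift right by 31 reproduces a 64-bit sign-extend-in-register. A small self-contained sketch of that decode and arithmetic (names are illustrative, not LLVM APIs):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Decode the packed S_BFE immediate: offset in bits [5:0], width in [22:16].
    static void decodeBFEImm(uint32_t Imm, uint32_t &Offset, uint32_t &Width) {
      Offset = Imm & 0x3f;
      Width  = (Imm & 0x7f0000) >> 16;
    }

    // Equivalent of "V_BFE_I32 lo, src.sub0, 0, Width" followed by
    // "V_ASHRREV_I32 hi, 31, lo": sign-extend the low Width bits to 64 bits.
    static uint64_t sextInReg64(uint32_t SrcLo, uint32_t Width) {
      assert(Width > 0 && Width < 32);
      uint32_t Lo = (uint32_t)((int32_t)(SrcLo << (32 - Width)) >> (32 - Width));
      uint32_t Hi = (uint32_t)((int32_t)Lo >> 31); // all ones if Lo is negative
      return ((uint64_t)Hi << 32) | Lo;
    }

    int main() {
      uint32_t Offset, Width;
      decodeBFEImm(0x100000, Offset, Width);  // offset 0, width 16
      printf("offset=%u width=%u\n", (unsigned)Offset, (unsigned)Width);
      printf("sext(0x8000, 16) = 0x%016llx\n",
             (unsigned long long)sextInReg64(0x8000, Width));
      return 0;
    }
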
    3962             : 
    3963      106492 : void SIInstrInfo::addUsersToMoveToVALUWorklist(
    3964             :   unsigned DstReg,
    3965             :   MachineRegisterInfo &MRI,
    3966             :   SetVectorType &Worklist) const {
    3967      106492 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
    3968      350005 :          E = MRI.use_end(); I != E;) {
    3969      137021 :     MachineInstr &UseMI = *I->getParent();
    3970      137021 :     if (!canReadVGPR(UseMI, I.getOperandNo())) {
    3971       75170 :       Worklist.insert(&UseMI);
    3972             : 
    3973             :       do {
    3974       75303 :         ++I;
    3975       75303 :       } while (I != E && I->getParent() == &UseMI);
    3976             :     } else {
    3977             :       ++I;
    3978             :     }
    3979             :   }
    3980      106492 : }
    3981             : 
    3982          54 : void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
    3983             :                                  MachineRegisterInfo &MRI,
    3984             :                                  MachineInstr &Inst) const {
    3985          54 :   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3986          54 :   MachineBasicBlock *MBB = Inst.getParent();
    3987         108 :   MachineOperand &Src0 = Inst.getOperand(1);
    3988         108 :   MachineOperand &Src1 = Inst.getOperand(2);
    3989          54 :   const DebugLoc &DL = Inst.getDebugLoc();
    3990             : 
    3991         108 :   switch (Inst.getOpcode()) {
    3992          51 :   case AMDGPU::S_PACK_LL_B32_B16: {
    3993          51 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3994          51 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    3995             : 
    3996             :     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    3997             :     // 0.
    3998         153 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    3999          51 :       .addImm(0xffff);
    4000             : 
    4001         153 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
    4002          51 :       .addReg(ImmReg, RegState::Kill)
    4003          51 :       .add(Src0);
    4004             : 
    4005         153 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
    4006          51 :       .add(Src1)
    4007          51 :       .addImm(16)
    4008          51 :       .addReg(TmpReg, RegState::Kill);
    4009          51 :     break;
    4010             :   }
    4011           2 :   case AMDGPU::S_PACK_LH_B32_B16: {
    4012           2 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4013           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4014           2 :       .addImm(0xffff);
    4015           6 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
    4016           2 :       .addReg(ImmReg, RegState::Kill)
    4017           2 :       .add(Src0)
    4018           2 :       .add(Src1);
    4019           2 :     break;
    4020             :   }
    4021           1 :   case AMDGPU::S_PACK_HH_B32_B16: {
    4022           1 :     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4023           1 :     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    4024           2 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
    4025           1 :       .addImm(16)
    4026           1 :       .add(Src0);
    4027           3 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
    4028           1 :       .addImm(0xffff0000);
    4029           3 :     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
    4030           1 :       .add(Src1)
    4031           1 :       .addReg(ImmReg, RegState::Kill)
    4032           1 :       .addReg(TmpReg, RegState::Kill);
    4033           1 :     break;
    4034             :   }
    4035           0 :   default:
    4036           0 :     llvm_unreachable("unhandled s_pack_* instruction");
    4037             :   }
    4038             : 
    4039          54 :   MachineOperand &Dest = Inst.getOperand(0);
    4040          54 :   MRI.replaceRegWith(Dest.getReg(), ResultReg);
    4041          54 :   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    4042          54 : }
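
Each case above rebuilds one of the scalar pack operations out of VALU instructions; the net effect is plain 16-bit field selection and concatenation. A standalone sketch of the value each sequence produces, written as ordinary 32-bit arithmetic (not LLVM API):

    #include <cstdint>
    #include <cstdio>

    // S_PACK_LL_B32_B16: low 16 bits of Src0 in [15:0], low 16 of Src1 in [31:16].
    static uint32_t packLL(uint32_t Src0, uint32_t Src1) {
      return (Src0 & 0xffff) | (Src1 << 16);         // V_AND_B32 + V_LSHL_OR_B32
    }

    // S_PACK_LH_B32_B16: low 16 bits of Src0, high 16 bits of Src1.
    static uint32_t packLH(uint32_t Src0, uint32_t Src1) {
      return (Src0 & 0xffff) | (Src1 & 0xffff0000);  // V_BFI_B32 with 0xffff mask
    }

    // S_PACK_HH_B32_B16: high 16 bits of Src0 in [15:0], high 16 of Src1 in [31:16].
    static uint32_t packHH(uint32_t Src0, uint32_t Src1) {
      return (Src0 >> 16) | (Src1 & 0xffff0000);     // V_LSHRREV_B32 + V_AND_OR_B32
    }

    int main() {
      printf("LL: %08x\n", packLL(0xaaaabbbb, 0xccccdddd)); // ddddbbbb
      printf("LH: %08x\n", packLH(0xaaaabbbb, 0xccccdddd)); // ccccbbbb
      printf("HH: %08x\n", packHH(0xaaaabbbb, 0xccccdddd)); // ccccaaaa
      return 0;
    }
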
    4043             : 
    4044       38885 : void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    4045             :     MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
    4046             :   // This assumes that all the users of SCC are in the same block
    4047             :   // as the SCC def.
    4048             :   for (MachineInstr &MI :
    4049             :        make_range(MachineBasicBlock::iterator(SCCDefInst),
    4050     3713617 :                       SCCDefInst.getParent()->end())) {
    4051             :     // Exit if we find another SCC def.
    4052     1774543 :     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
    4053       29894 :       return;
    4054             : 
    4055     1744649 :     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
    4056          77 :       Worklist.insert(&MI);
    4057             :   }
    4058             : }
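
The loop above walks forward from the converted SCC-defining instruction, queues every instruction that still reads SCC, and stops as soon as another instruction redefines SCC, since later readers observe that newer value. A simplified standalone sketch of the scan that starts just past the def (generic types, not LLVM API; the real code iterates MachineInstrs within the block):

    #include <cstdio>
    #include <vector>

    struct Inst { bool DefsSCC; bool UsesSCC; const char *Name; };

    static void collectSCCUsers(const std::vector<Inst> &Block, size_t DefIdx,
                                std::vector<const Inst *> &Worklist) {
      for (size_t I = DefIdx + 1; I != Block.size(); ++I) {
        if (Block[I].DefsSCC)          // another def ends the SCC live range
          return;
        if (Block[I].UsesSCC)
          Worklist.push_back(&Block[I]);
      }
    }

    int main() {
      std::vector<Inst> Block = {
          {true, false, "s_add"}, {false, true, "s_cselect"},
          {true, false, "s_cmp"}, {false, true, "s_cbranch_scc1"}};
      std::vector<const Inst *> Worklist;
      collectSCCUsers(Block, 0, Worklist);
      for (const Inst *I : Worklist)
        printf("%s\n", I->Name); // only s_cselect: the s_cmp redefines SCC
      return 0;
    }
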
    4059             : 
    4060      104676 : const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    4061             :   const MachineInstr &Inst) const {
    4062      104676 :   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
    4063             : 
    4064      209352 :   switch (Inst.getOpcode()) {
    4065             :   // For target instructions, getOpRegClass just returns the virtual register
    4066             :   // class associated with the operand, so we need to find an equivalent VGPR
    4067             :   // register class in order to move the instruction to the VALU.
    4068       68989 :   case AMDGPU::COPY:
    4069             :   case AMDGPU::PHI:
    4070             :   case AMDGPU::REG_SEQUENCE:
    4071             :   case AMDGPU::INSERT_SUBREG:
    4072             :   case AMDGPU::WQM:
    4073             :   case AMDGPU::WWM:
    4074       68989 :     if (RI.hasVGPRs(NewDstRC))
    4075             :       return nullptr;
    4076             : 
    4077       68989 :     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    4078       68989 :     if (!NewDstRC)
    4079             :       return nullptr;
    4080       68989 :     return NewDstRC;
    4081             :   default:
    4082             :     return NewDstRC;
    4083             :   }
    4084             : }
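
For the generic opcodes listed above, the destination class is simply whatever class the virtual register already has, so an SGPR class must be translated to a VGPR class of the same width before the instruction can move to the VALU. A standalone sketch of that kind of width-preserving bank swap (an illustrative subset of class names, not the real getEquivalentVGPRClass):

    #include <cstdio>
    #include <cstring>

    static const char *equivalentVGPRClass(const char *RC) {
      // Width is preserved; only the register bank changes (illustrative subset).
      if (!strcmp(RC, "SGPR_32"))  return "VGPR_32";
      if (!strcmp(RC, "SReg_64"))  return "VReg_64";
      if (!strcmp(RC, "SReg_128")) return "VReg_128";
      return RC; // already a VGPR class, or not covered by this sketch
    }

    int main() {
      printf("%s -> %s\n", "SReg_64", equivalentVGPRClass("SReg_64"));
      return 0;
    }
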
    4085             : 
    4086             : // Find the one SGPR operand we are allowed to use.
    4087       46997 : unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
    4088             :                                    int OpIndices[3]) const {
    4089       46997 :   const MCInstrDesc &Desc = MI.getDesc();
    4090             : 
    4091             :   // Find the one SGPR operand we are allowed to use.
    4092             :   //
    4093             :   // First we need to consider the instruction's operand requirements before
    4094             :   // legalizing. Some operands are required to be SGPRs, such as implicit uses
    4095             :   // of VCC, but we are still bound by the constant bus requirement to only use
    4096             :   // one.
    4097             :   //
    4098             :   // If the operand's class is an SGPR, we can never move it.
    4099             : 
    4100       46997 :   unsigned SGPRReg = findImplicitSGPRRead(MI);
    4101       46997 :   if (SGPRReg != AMDGPU::NoRegister)
    4102             :     return SGPRReg;
    4103             : 
    4104       46874 :   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
    4105       46874 :   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    4106             : 
    4107      145143 :   for (unsigned i = 0; i < 3; ++i) {
    4108      136966 :     int Idx = OpIndices[i];
    4109      136966 :     if (Idx == -1)
    4110             :       break;
    4111             : 
    4112      209444 :     const MachineOperand &MO = MI.getOperand(Idx);
    4113      104722 :     if (!MO.isReg())
    4114       10827 :       continue;
    4115             : 
    4116             :     // Is this operand statically required to be an SGPR based on the operand
    4117             :     // constraints?
    4118      187790 :     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    4119      187790 :     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    4120       93895 :     if (IsRequiredSGPR)
    4121        6453 :       return MO.getReg();
    4122             : 
    4123             :     // If this could be a VGPR or an SGPR, check the dynamic register class.
    4124       87442 :     unsigned Reg = MO.getReg();
    4125       87442 :     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    4126      174884 :     if (RI.isSGPRClass(RegRC))
    4127       41360 :       UsedSGPRs[i] = Reg;
    4128             :   }
    4129             : 
    4130             :   // We don't have a required SGPR operand, so we have a bit more freedom in
    4131             :   // selecting operands to move.
    4132             : 
    4133             :   // Try to select the most used SGPR. If an SGPR is equal to one of the
    4134             :   // others, we choose that.
    4135             :   //
    4136             :   // e.g.
    4137             :   // V_FMA_F32 v0, s0, s0, s0 -> No moves
    4138             :   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
    4139             : 
    4140             :   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
    4141             :   // prefer those.
    4142             : 
    4143       40421 :   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    4144       17007 :     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
    4145         351 :       SGPRReg = UsedSGPRs[0];
    4146             :   }
    4147             : 
    4148       40421 :   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    4149       13731 :     if (UsedSGPRs[1] == UsedSGPRs[2])
    4150           8 :       SGPRReg = UsedSGPRs[1];
    4151             :   }
    4152             : 
    4153             :   return SGPRReg;
    4154             : }
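
The tie-breaking at the end of the function implements the rule from the comment: since only one SGPR may ride the constant bus, keep an SGPR that appears in more than one operand so the fewest operands have to be copied to VGPRs. A standalone sketch of just that selection rule (NoRegister modeled as 0; not LLVM API):

    #include <cstdio>

    static const unsigned NoRegister = 0;

    static unsigned pickSGPRToKeep(const unsigned UsedSGPRs[3]) {
      if (UsedSGPRs[0] != NoRegister &&
          (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]))
        return UsedSGPRs[0];
      if (UsedSGPRs[1] != NoRegister && UsedSGPRs[1] == UsedSGPRs[2])
        return UsedSGPRs[1];
      return NoRegister; // no repeated SGPR; nothing is obviously best to keep
    }

    int main() {
      unsigned AllSame[3] = {5, 5, 5}; // V_FMA_F32 v0, s0, s0, s0 -> keep s0
      unsigned OneDiff[3] = {5, 7, 5}; // V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1
      printf("%u %u\n", pickSGPRToKeep(AllSame), pickSGPRToKeep(OneDiff));
      return 0;
    }
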
    4155             : 
    4156     2513669 : MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
    4157             :                                              unsigned OperandName) const {
    4158     5027338 :   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
    4159     2513669 :   if (Idx == -1)
    4160             :     return nullptr;
    4161             : 
    4162     3823150 :   return &MI.getOperand(Idx);
    4163             : }
    4164             : 
    4165       22495 : uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
    4166       22495 :   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
    4167       44990 :   if (ST.isAmdHsaOS()) {
    4168             :     // Set ATC = 1. GFX9 doesn't have this bit.
    4169         436 :     if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
    4170         341 :       RsrcDataFormat |= (1ULL << 56);
    4171             : 
    4172             :     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    4173             :     // Note that it disables TC L2 and therefore decreases performance.
    4174         436 :     if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
    4175         236 :       RsrcDataFormat |= (2ULL << 59);
    4176             :   }
    4177             : 
    4178       22495 :   return RsrcDataFormat;
    4179             : }
    4180             : 
    4181         463 : uint64_t SIInstrInfo::getScratchRsrcWords23() const {
    4182         463 :   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
    4183             :                     AMDGPU::RSRC_TID_ENABLE |
    4184         463 :                     0xffffffff; // Size
    4185             : 
    4186             :   // GFX9 doesn't have ELEMENT_SIZE.
    4187         463 :   if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    4188         794 :     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    4189         397 :     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
    4190             :   }
    4191             : 
    4192             :   // IndexStride = 64.
    4193         463 :   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
    4194             : 
    4195             :   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
    4196             :   // Clear them unless we want a huge stride.
    4197         463 :   if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    4198         232 :     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
    4199             : 
    4200         463 :   return Rsrc23;
    4201             : }
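
The ELEMENT_SIZE field written above is derived as Log2_32(MaxPrivateElementSize) - 1, which maps the supported private element sizes of 4, 8, and 16 bytes to the field encodings 1, 2, and 3. A tiny standalone sketch of that mapping (log2u32 stands in for llvm::Log2_32 and is not an LLVM API):

    #include <cstdint>
    #include <cstdio>

    static unsigned log2u32(uint32_t V) {
      unsigned R = 0;
      while (V >>= 1)
        ++R;
      return R; // floor(log2(V)) for V >= 1
    }

    int main() {
      for (unsigned Size : {4u, 8u, 16u})
        printf("element size %2u bytes -> field value %u\n", Size, log2u32(Size) - 1);
      return 0;
    }
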
    4202             : 
    4203          60 : bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
    4204         120 :   unsigned Opc = MI.getOpcode();
    4205             : 
    4206         120 :   return isSMRD(Opc);
    4207             : }
    4208             : 
    4209          14 : bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
    4210          28 :   unsigned Opc = MI.getOpcode();
    4211             : 
    4212          56 :   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
    4213             : }
    4214             : 
    4215        2783 : unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
    4216             :                                     int &FrameIndex) const {
    4217        2783 :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    4218        5499 :   if (!Addr || !Addr->isFI())
    4219             :     return AMDGPU::NoRegister;
    4220             : 
    4221             :   assert(!MI.memoperands_empty() &&
    4222             :          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
    4223             : 
    4224        2059 :   FrameIndex = Addr->getIndex();
    4225        2059 :   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
    4226             : }
    4227             : 
    4228          57 : unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
    4229             :                                         int &FrameIndex) const {
    4230          57 :   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
    4231             :   assert(Addr && Addr->isFI());
    4232          57 :   FrameIndex = Addr->getIndex();
    4233          57 :   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
    4234             : }
    4235             : 
    4236       13212 : unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
    4237             :                                           int &FrameIndex) const {
    4238       13212 :   if (!MI.mayLoad())
    4239             :     return AMDGPU::NoRegister;
    4240             : 
    4241        3045 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4242        1404 :     return isStackAccess(MI, FrameIndex);
    4243             : 
    4244         815 :   if (isSGPRSpill(MI))
    4245          57 :     return isSGPRStackAccess(MI, FrameIndex);
    4246             : 
    4247             :   return AMDGPU::NoRegister;
    4248             : }
    4249             : 
    4250        8026 : unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
    4251             :                                          int &FrameIndex) const {
    4252        8026 :   if (!MI.mayStore())
    4253             :     return AMDGPU::NoRegister;
    4254             : 
    4255        2235 :   if (isMUBUF(MI) || isVGPRSpill(MI))
    4256        1379 :     return isStackAccess(MI, FrameIndex);
    4257             : 
    4258         428 :   if (isSGPRSpill(MI))
    4259           0 :     return isSGPRStackAccess(MI, FrameIndex);
    4260             : 
    4261             :   return AMDGPU::NoRegister;
    4262             : }
    4263             : 
    4264      620722 : unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
    4265     1241444 :   unsigned Opc = MI.getOpcode();
    4266      620722 :   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
    4267      620722 :   unsigned DescSize = Desc.getSize();
    4268             : 
    4269             :   // If we have a definitive size, we can use it. Otherwise we need to inspect
    4270             :   // the operands to know the size.
    4271             :   //
    4272             :   // FIXME: Instructions that have a base 32-bit encoding report their size as
    4273             :   // 4, even though they are really 8 bytes if they have a literal operand.
    4274      620722 :   if (DescSize != 0 && DescSize != 4)
    4275             :     return DescSize;
    4276             : 
    4277             :   // 4-byte instructions may have a 32-bit literal encoded after them. Check
    4278             :   // operands that could ever be literals.
    4279      710730 :   if (isVALU(MI) || isSALU(MI)) {
    4280      409603 :     if (isFixedSize(MI))
    4281             :       return DescSize;
    4282             : 
    4283      408602 :     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    4284      408602 :     if (Src0Idx == -1)
    4285             :       return 4; // No operands.
    4286             : 
    4287      588748 :     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
    4288             :       return 8;
    4289             : 
    4290      264941 :     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    4291      264941 :     if (Src1Idx == -1)
    4292             :       return 4;
    4293             : 
    4294      244162 :     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
    4295             :       return 8;
    4296             : 
    4297      112717 :     return 4;
    4298             :   }
    4299             : 
    4300       38407 :   if (DescSize == 4)
    4301             :     return 4;
    4302             : 
    4303             :   switch (Opc) {
    4304             :   case TargetOpcode::IMPLICIT_DEF:
    4305             :   case TargetOpcode::KILL:
    4306             :   case TargetOpcode::DBG_VALUE:
    4307             :   case TargetOpcode::BUNDLE:
    4308             :   case TargetOpcode::EH_LABEL:
    4309             :     return 0;
    4310        2633 :   case TargetOpcode::INLINEASM: {
    4311        2633 :     const MachineFunction *MF = MI.getParent()->getParent();
    4312        2633 :     const char *AsmStr = MI.getOperand(0).getSymbolName();
    4313        2633 :     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
    4314             :   }
    4315           0 :   default:
    4316           0 :     llvm_unreachable("unable to find instruction size");
    4317             :   }
    4318             : }
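
For VALU and SALU instructions, the logic above only grows the reported size beyond the base 4-byte encoding when src0 or src1 needs a 32-bit literal, because that literal is emitted as an extra dword following the instruction word. A standalone sketch of the decision with a simplified operand model (not LLVM API):

    #include <cstdio>

    struct Operand { bool IsLiteral; };

    static unsigned instSizeBytes(const Operand *Src0, const Operand *Src1) {
      if (Src0 && Src0->IsLiteral)
        return 8; // instruction word + trailing 32-bit literal
      if (Src1 && Src1->IsLiteral)
        return 8;
      return 4;   // no operands, or only inline-encodable operands
    }

    int main() {
      Operand Inline = {false}, Literal = {true};
      printf("%u %u %u\n", instSizeBytes(nullptr, nullptr),
             instSizeBytes(&Inline, &Inline), instSizeBytes(&Literal, &Inline));
      return 0;
    }
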
    4319             : 
    4320          90 : bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
    4321          90 :   if (!isFLAT(MI))
    4322             :     return false;
    4323             : 
    4324          11 :   if (MI.memoperands_empty())
    4325             :     return true;
    4326             : 
    4327           8 :   for (const MachineMemOperand *MMO : MI.memoperands()) {
    4328           5 :     if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
    4329             :       return true;
    4330             :   }
    4331             :   return false;
    4332             : }
    4333             : 
    4334           0 : bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
    4335           0 :   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
    4336             : }
    4337             : 
    4338           0 : void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
    4339             :                                             MachineBasicBlock *IfEnd) const {
    4340           0 :   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
    4341             :   assert(TI != IfEntry->end());
    4342             : 
    4343           0 :   MachineInstr *Branch = &(*TI);
    4344           0 :   MachineFunction *MF = IfEntry->getParent();
    4345           0 :   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
    4346             : 
    4347           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    4348           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4349             :     MachineInstr *SIIF =
    4350           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
    4351           0 :             .add(Branch->getOperand(0))
    4352           0 :             .add(Branch->getOperand(1));
    4353             :     MachineInstr *SIEND =
    4354           0 :         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
    4355           0 :             .addReg(DstReg);
    4356             : 
    4357           0 :     IfEntry->erase(TI);
    4358           0 :     IfEntry->insert(IfEntry->end(), SIIF);
    4359           0 :     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
    4360             :   }
    4361           0 : }
    4362             : 
    4363           0 : void SIInstrInfo::convertNonUniformLoopRegion(
    4364             :     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
    4365           0 :   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
    4366             :   // We expect 2 terminators, one conditional and one unconditional.
    4367             :   assert(TI != LoopEnd->end());
    4368             : 
    4369           0 :   MachineInstr *Branch = &(*TI);
    4370           0 :   MachineFunction *MF = LoopEnd->getParent();
    4371           0 :   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
    4372             : 
    4373           0 :   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    4374             : 
    4375           0 :     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4376           0 :     unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4377             :     MachineInstrBuilder HeaderPHIBuilder =
    4378           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    4379           0 :     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
    4380           0 :                                           E = LoopEntry->pred_end();
    4381           0 :          PI != E; ++PI) {
    4382           0 :       if (*PI == LoopEnd) {
    4383           0 :         HeaderPHIBuilder.addReg(BackEdgeReg);
    4384             :       } else {
    4385           0 :         MachineBasicBlock *PMBB = *PI;
    4386           0 :         unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4387           0 :         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
    4388             :                              ZeroReg, 0);
    4389           0 :         HeaderPHIBuilder.addReg(ZeroReg);
    4390             :       }
    4391           0 :       HeaderPHIBuilder.addMBB(*PI);
    4392             :     }
    4393           0 :     MachineInstr *HeaderPhi = HeaderPHIBuilder;
    4394           0 :     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
    4395           0 :                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
    4396           0 :                                   .addReg(DstReg)
    4397           0 :                                   .add(Branch->getOperand(0));
    4398             :     MachineInstr *SILOOP =
    4399           0 :         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
    4400           0 :             .addReg(BackEdgeReg)
    4401           0 :             .addMBB(LoopEntry);
    4402             : 
    4403           0 :     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    4404           0 :     LoopEnd->erase(TI);
    4405           0 :     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    4406           0 :     LoopEnd->insert(LoopEnd->end(), SILOOP);
    4407             :   }
    4408           0 : }
    4409             : 
    4410             : ArrayRef<std::pair<int, const char *>>
    4411           5 : SIInstrInfo::getSerializableTargetIndices() const {
    4412             :   static const std::pair<int, const char *> TargetIndices[] = {
    4413             :       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
    4414             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
    4415             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
    4416             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
    4417             :       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
    4418           5 :   return makeArrayRef(TargetIndices);
    4419             : }
    4420             : 
    4421             : /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
    4422             : /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
    4423             : ScheduleHazardRecognizer *
    4424       11258 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
    4425             :                                             const ScheduleDAG *DAG) const {
    4426       11258 :   return new GCNHazardRecognizer(DAG->MF);
    4427             : }
    4428             : 
    4429             : /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
    4430             : /// pass.
    4431             : ScheduleHazardRecognizer *
    4432       14880 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
    4433       14880 :   return new GCNHazardRecognizer(MF);
    4434             : }
    4435             : 
    4436             : std::pair<unsigned, unsigned>
    4437           3 : SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
    4438           6 :   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
    4439             : }
    4440             : 
    4441             : ArrayRef<std::pair<unsigned, const char *>>
    4442           4 : SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
    4443             :   static const std::pair<unsigned, const char *> TargetFlags[] = {
    4444             :     { MO_GOTPCREL, "amdgpu-gotprel" },
    4445             :     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    4446             :     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    4447             :     { MO_REL32_LO, "amdgpu-rel32-lo" },
    4448             :     { MO_REL32_HI, "amdgpu-rel32-hi" }
    4449             :   };
    4450             : 
    4451           4 :   return makeArrayRef(TargetFlags);
    4452             : }
    4453             : 
    4454         741 : bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
    4455        1970 :   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
    4456        1793 :          MI.modifiesRegister(AMDGPU::EXEC, &RI);
    4457             : }
    4458             : 
    4459             : MachineInstrBuilder
    4460           0 : SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
    4461             :                            MachineBasicBlock::iterator I,
    4462             :                            const DebugLoc &DL,
    4463             :                            unsigned DestReg) const {
    4464           0 :   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    4465             : 
    4466           0 :   unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    4467             : 
    4468           0 :   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
    4469           0 :            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
    4470      216918 : }

Generated by: LCOV version 1.13