LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaits.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 171 208 82.2 %
Date: 2018-05-09 00:02:34 Functions: 13 16 81.2 %
Legend: Lines: hit not hit

          Line data    Source code
//===- SIInsertWaits.cpp - Insert S_WAITCNT instructions ------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/SmallVector.h"
      27             : #include "llvm/ADT/StringRef.h"
      28             : #include "llvm/CodeGen/MachineBasicBlock.h"
      29             : #include "llvm/CodeGen/MachineFunction.h"
      30             : #include "llvm/CodeGen/MachineFunctionPass.h"
      31             : #include "llvm/CodeGen/MachineInstr.h"
      32             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      33             : #include "llvm/CodeGen/MachineOperand.h"
      34             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      35             : #include "llvm/IR/DebugLoc.h"
      36             : #include "llvm/MC/MCInstrDesc.h"
      37             : #include "llvm/Pass.h"
      38             : #include "llvm/Support/Debug.h"
      39             : #include "llvm/Support/raw_ostream.h"
      40             : #include <algorithm>
      41             : #include <cassert>
      42             : #include <cstdint>
      43             : #include <cstring>
      44             : #include <utility>
      45             : 
      46             : #define DEBUG_TYPE "si-insert-waits"
      47             : 
      48             : using namespace llvm;
      49             : 
namespace {

/// One variable for each of the hardware counters
/// (mirrors SIInstrFlags::VM_CNT / EXP_CNT / LGKM_CNT, see getHwCounts).
using Counters = union {
  struct {
    unsigned VM;   // VM_CNT
    unsigned EXP;  // EXP_CNT
    unsigned LGKM; // LGKM_CNT
  } Named;
  unsigned Array[3]; // Same storage in indexable form; order matches Named.
};

// Coarse classification of the last instruction issued; used to detect
// VMEM/SMEM clauses that must be broken with S_NOP (see pushInstruction).
using InstType = enum {
  OTHER,
  SMEM,
  VMEM
};

// Counter snapshots per register, indexed by the register's encoding value
// (see getRegInterval). 512 entries — presumably sized to the register
// encoding space; confirm against TRI->getEncodingValue's range.
using RegCounters =  Counters[512];
using RegInterval = std::pair<unsigned, unsigned>;
      70             : 
/// Machine pass that inserts S_WAITCNT instructions so that results of
/// asynchronous memory operations are waited on before being read or
/// overwritten.
class SIInsertWaits : public MachineFunctionPass {
private:
  // Cached per-function target info; set up in runOnMachineFunction.
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI;
  AMDGPU::IsaInfo::IsaVersion ISA;

  /// Constant zero value
  static const Counters ZeroCounts;

  /// Hardware limits
  Counters HardwareLimits;

  /// Counter values we have already waited on.
  Counters WaitedOn;

  /// Counter values that we must wait on before the next counter
  /// increase.
  Counters DelayedWaitOn;

  /// Counter values for last instruction issued.
  Counters LastIssued;

  /// Registers used by async instructions.
  RegCounters UsedRegs;

  /// Registers defined by async instructions.
  RegCounters DefinedRegs;

  /// Different export instruction types seen since last wait.
  /// Bit 0: EXP instructions, bit 1: other EXP_CNT writes (see
  /// pushInstruction); both set (== 3) means EXP_CNT is unordered.
  unsigned ExpInstrTypesSeen = 0;

  /// Type of the last opcode.
  InstType LastOpcodeType;

  /// Whether the last issued instruction wrote to M0 (S_SENDMSG hazard).
  bool LastInstWritesM0;

  /// Whether or not we have flat operations outstanding.
  bool IsFlatOutstanding;

  /// Whether the machine function returns void
  bool ReturnsVoid;

  /// Whether the VCCZ bit is possibly corrupt
  /// (SI/CI hardware bug work-around, see runOnMachineFunction).
  bool VCCZCorrupt = false;

  /// Get increment/decrement amount for this instruction.
  Counters getHwCounts(MachineInstr &MI);

  /// Is operand relevant for async execution?
  bool isOpRelevant(MachineOperand &Op);

  /// Get register interval an operand affects.
  RegInterval getRegInterval(const TargetRegisterClass *RC,
                             const MachineOperand &Reg) const;

  /// Handle instructions async components
  void pushInstruction(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I,
                       const Counters& Increment);

  /// Insert the actual wait instruction
  bool insertWait(MachineBasicBlock &MBB,
                  MachineBasicBlock::iterator I,
                  const Counters &Counts);

  /// Handle existing wait instructions (from intrinsics)
  void handleExistingWait(MachineBasicBlock::iterator I);

  /// Do we need def2def checks?
  bool unorderedDefines(MachineInstr &MI);

  /// Resolve all operand dependencies to counter requirements
  Counters handleOperands(MachineInstr &MI);

  /// Insert S_NOP between an instruction writing M0 and S_SENDMSG.
  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  /// Return true if there are LGKM instructions that haven't been waited on
  /// yet.
  bool hasOutstandingLGKM() const;

public:
  static char ID;

  SIInsertWaits() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
     170             : 
     171             : } // end anonymous namespace
     172             : 
// Register the pass with the LLVM pass infrastructure.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

char SIInsertWaits::ID = 0;

char &llvm::SIInsertWaitsID = SIInsertWaits::ID;

// Factory hook used by the AMDGPU pass pipeline.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}

const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
     187             : 
     188             : static bool readsVCCZ(const MachineInstr &MI) {
     189          32 :   unsigned Opc = MI.getOpcode();
     190          34 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     191           2 :          !MI.getOperand(1).isUndef();
     192             : }
     193             : 
     194             : bool SIInsertWaits::hasOutstandingLGKM() const {
     195             :   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
     196             : }
     197             : 
     198          55 : Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
     199          55 :   uint64_t TSFlags = MI.getDesc().TSFlags;
     200             :   Counters Result = { { 0, 0, 0 } };
     201             : 
     202          55 :   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
     203             : 
     204             :   // Only consider stores or EXP for EXP_CNT
     205          55 :   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
     206             : 
     207             :   // LGKM may uses larger values
     208          55 :   if (TSFlags & SIInstrFlags::LGKM_CNT) {
     209             : 
     210          14 :     if (TII->isSMRD(MI)) {
     211             : 
     212           3 :       if (MI.getNumOperands() != 0) {
     213             :         assert(MI.getOperand(0).isReg() &&
     214             :                "First LGKM operand must be a register!");
     215             : 
     216             :         // XXX - What if this is a write into a super register?
     217           3 :         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
     218           3 :         unsigned Size = TRI->getRegSizeInBits(*RC);
     219           3 :         Result.Named.LGKM = Size > 32 ? 2 : 1;
     220             :       } else {
     221             :         // s_dcache_inv etc. do not have a destination register. Assume we
     222             :         // want a wait on these.
     223             :         // XXX - What is the right value?
     224             :         Result.Named.LGKM = 1;
     225             :       }
     226             :     } else {
     227             :       // DS
     228             :       Result.Named.LGKM = 1;
     229             :     }
     230             : 
     231             :   } else {
     232             :     Result.Named.LGKM = 0;
     233             :   }
     234             : 
     235          55 :   return Result;
     236             : }
     237             : 
     238         137 : bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
     239             :   // Constants are always irrelevant
     240         205 :   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
     241             :     return false;
     242             : 
     243             :   // Defines are always relevant
     244          68 :   if (Op.isDef())
     245             :     return true;
     246             : 
     247             :   // For exports all registers are relevant.
     248             :   // TODO: Skip undef/disabled registers.
     249          57 :   MachineInstr &MI = *Op.getParent();
     250          57 :   if (TII->isEXP(MI))
     251             :     return true;
     252             : 
     253             :   // For stores the stored value is also relevant
     254         114 :   if (!MI.getDesc().mayStore())
     255             :     return false;
     256             : 
     257             :   // Check if this operand is the value being stored.
     258             :   // Special case for DS/FLAT instructions, since the address
     259             :   // operand comes before the value operand and it may have
     260             :   // multiple data operands.
     261             : 
     262          30 :   if (TII->isDS(MI)) {
     263           0 :     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
     264           0 :     if (Data0 && Op.isIdenticalTo(*Data0))
     265             :       return true;
     266             : 
     267           0 :     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
     268           0 :     return Data1 && Op.isIdenticalTo(*Data1);
     269             :   }
     270             : 
     271          30 :   if (TII->isFLAT(MI)) {
     272          12 :     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
     273          12 :     if (Data && Op.isIdenticalTo(*Data))
     274             :       return true;
     275             :   }
     276             : 
     277             :   // NOTE: This assumes that the value operand is before the
     278             :   // address operand, and that there is only one value operand.
     279          27 :   for (MachineInstr::mop_iterator I = MI.operands_begin(),
     280          54 :        E = MI.operands_end(); I != E; ++I) {
     281             : 
     282          54 :     if (I->isReg() && I->isUse())
     283          27 :       return Op.isIdenticalTo(*I);
     284             :   }
     285             : 
     286             :   return false;
     287             : }
     288             : 
     289             : RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
     290             :                                           const MachineOperand &Reg) const {
     291             :   unsigned Size = TRI->getRegSizeInBits(*RC);
     292             :   assert(Size >= 32);
     293             : 
     294             :   RegInterval Result;
     295         270 :   Result.first = TRI->getEncodingValue(Reg.getReg());
     296         135 :   Result.second = Result.first + Size / 32;
     297             : 
     298             :   return Result;
     299             : }
     300             : 
     301          55 : void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
     302             :                                     MachineBasicBlock::iterator I,
     303             :                                     const Counters &Increment) {
     304             :   // Get the hardware counter increments and sum them up
     305          55 :   Counters Limit = ZeroCounts;
     306             :   unsigned Sum = 0;
     307             : 
     308          55 :   if (TII->mayAccessFlatAddressSpace(*I))
     309           8 :     IsFlatOutstanding = true;
     310             : 
     311         385 :   for (unsigned i = 0; i < 3; ++i) {
     312         165 :     LastIssued.Array[i] += Increment.Array[i];
     313         165 :     if (Increment.Array[i])
     314          37 :       Limit.Array[i] = LastIssued.Array[i];
     315         165 :     Sum += Increment.Array[i];
     316             :   }
     317             : 
     318             :   // If we don't increase anything then that's it
     319          55 :   if (Sum == 0) {
     320          35 :     LastOpcodeType = OTHER;
     321          35 :     return;
     322             :   }
     323             : 
     324          20 :   if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     325             :     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
     326             :     // or SMEM clause, respectively.
     327             :     //
     328             :     // The temporary workaround is to break the clauses with S_NOP.
     329             :     //
     330             :     // The proper solution would be to allocate registers such that all source
     331             :     // and destination registers don't overlap, e.g. this is illegal:
     332             :     //   r0 = load r2
     333             :     //   r2 = load r0
     334          11 :     if (LastOpcodeType == VMEM && Increment.Named.VM) {
     335             :       // Insert a NOP to break the clause.
     336           4 :       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
     337             :           .addImm(0);
     338           1 :       LastInstWritesM0 = false;
     339             :     }
     340             : 
     341          11 :     if (TII->isSMRD(*I))
     342           0 :       LastOpcodeType = SMEM;
     343          11 :     else if (Increment.Named.VM)
     344          11 :       LastOpcodeType = VMEM;
     345             :   }
     346             : 
     347             :   // Remember which export instructions we have seen
     348          20 :   if (Increment.Named.EXP) {
     349           6 :     ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
     350             :   }
     351             : 
     352         157 :   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
     353         137 :     MachineOperand &Op = I->getOperand(i);
     354         137 :     if (!isOpRelevant(Op))
     355             :       continue;
     356             : 
     357          23 :     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
     358          23 :     RegInterval Interval = getRegInterval(RC, Op);
     359          97 :     for (unsigned j = Interval.first; j < Interval.second; ++j) {
     360             : 
     361             :       // Remember which registers we define
     362          37 :       if (Op.isDef())
     363          22 :         DefinedRegs[j] = Limit;
     364             : 
     365             :       // and which one we are using
     366          37 :       if (Op.isUse())
     367          15 :         UsedRegs[j] = Limit;
     368             :     }
     369             :   }
     370             : }
     371             : 
/// Emit an S_WAITCNT before \p I if any of the \p Required counter values
/// have not been waited on yet. Returns true iff an instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {
  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered except when there are flat instructions, which
  // can return out of order.
  Ordered[0] = !IsFlatOutstanding;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction.
  // Starting at the hardware maxima encodes "no wait" for any counter
  // that the loop below leaves untouched.
  Counters Counts = HardwareLimits;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {
    // Already waited far enough on this counter.
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // In-order counter: wait until at most (issued - required) operations
      // remain outstanding.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
    } else
      // Out-of-order counter: the only safe choice is to drain it fully.
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(AMDGPU::encodeWaitcnt(ISA,
                                  Counts.Named.VM,
                                  Counts.Named.EXP,
                                  Counts.Named.LGKM));

  // A wait resets the clause/M0/flat tracking state.
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  return true;
}
     437             : 
     438             : /// helper function for handleOperands
     439             : static void increaseCounters(Counters &Dst, const Counters &Src) {
     440        1786 :   for (unsigned i = 0; i < 3; ++i)
     441        1692 :     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
     442             : }
     443             : 
     444             : /// check whether any of the counters is non-zero
     445             : static bool countersNonZero(const Counters &Counter) {
     446         479 :   for (unsigned i = 0; i < 3; ++i)
     447         225 :     if (Counter.Array[i])
     448             :       return true;
     449             :   return false;
     450             : }
     451             : 
     452           0 : void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
     453             :   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
     454             : 
     455           0 :   unsigned Imm = I->getOperand(0).getImm();
     456             :   Counters Counts, WaitOn;
     457             : 
     458           0 :   Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
     459           0 :   Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
     460           0 :   Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
     461             : 
     462           0 :   for (unsigned i = 0; i < 3; ++i) {
     463           0 :     if (Counts.Array[i] <= LastIssued.Array[i])
     464           0 :       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
     465             :     else
     466           0 :       WaitOn.Array[i] = 0;
     467             :   }
     468             : 
     469             :   increaseCounters(DelayedWaitOn, WaitOn);
     470           0 : }
     471             : 
     472          55 : Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
     473          55 :   Counters Result = ZeroCounts;
     474             : 
     475             :   // For each register affected by this instruction increase the result
     476             :   // sequence.
     477             :   //
     478             :   // TODO: We could probably just look at explicit operands if we removed VCC /
     479             :   // EXEC from SMRD dest reg classes.
     480         265 :   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     481         210 :     MachineOperand &Op = MI.getOperand(i);
     482         322 :     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
     483             :       continue;
     484             : 
     485         112 :     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
     486         112 :     RegInterval Interval = getRegInterval(RC, Op);
     487         520 :     for (unsigned j = Interval.first; j < Interval.second; ++j) {
     488         204 :       if (Op.isDef()) {
     489             :         increaseCounters(Result, UsedRegs[j]);
     490             :         increaseCounters(Result, DefinedRegs[j]);
     491             :       }
     492             : 
     493         204 :       if (Op.isUse())
     494             :         increaseCounters(Result, DefinedRegs[j]);
     495             :     }
     496             :   }
     497             : 
     498          55 :   return Result;
     499             : }
     500             : 
     501          55 : void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
     502             :                                   MachineBasicBlock::iterator I) {
     503          55 :   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
     504             :     return;
     505             : 
     506             :   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
     507          23 :   if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
     508           0 :     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
     509           0 :     LastInstWritesM0 = false;
     510           0 :     return;
     511             :   }
     512             : 
     513             :   // Set whether this instruction sets M0
     514          23 :   LastInstWritesM0 = false;
     515             : 
     516          23 :   unsigned NumOperands = I->getNumOperands();
     517         217 :   for (unsigned i = 0; i < NumOperands; i++) {
     518          97 :     const MachineOperand &Op = I->getOperand(i);
     519             : 
     520         156 :     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
     521           0 :       LastInstWritesM0 = true;
     522             :   }
     523             : }
     524             : 
     525             : /// Return true if \p MBB has one successor immediately following, and is its
     526             : /// only predecessor
     527          16 : static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
     528          16 :   if (MBB.succ_size() != 1)
     529             :     return false;
     530             : 
     531           8 :   const MachineBasicBlock *Succ = *MBB.succ_begin();
     532           8 :   return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
     533             : }
     534             : 
     535             : // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
     536             : // around other non-memory instructions.
     537           5 : bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
     538             :   bool Changes = false;
     539             : 
     540           5 :   ST = &MF.getSubtarget<SISubtarget>();
     541           5 :   TII = ST->getInstrInfo();
     542           5 :   TRI = &TII->getRegisterInfo();
     543           5 :   MRI = &MF.getRegInfo();
     544           5 :   ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
     545           5 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     546             : 
     547           5 :   HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
     548           5 :   HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
     549           5 :   HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
     550             : 
     551           5 :   WaitedOn = ZeroCounts;
     552           5 :   DelayedWaitOn = ZeroCounts;
     553           5 :   LastIssued = ZeroCounts;
     554           5 :   LastOpcodeType = OTHER;
     555           5 :   LastInstWritesM0 = false;
     556           5 :   IsFlatOutstanding = false;
     557           5 :   ReturnsVoid = MFI->returnsVoid();
     558             : 
     559           5 :   memset(&UsedRegs, 0, sizeof(UsedRegs));
     560           5 :   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
     561             : 
     562             :   SmallVector<MachineInstr *, 4> RemoveMI;
     563             :   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
     564             : 
     565             :   bool HaveScalarStores = false;
     566             : 
     567             :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
     568          21 :        BI != BE; ++BI) {
     569             :     MachineBasicBlock &MBB = *BI;
     570             : 
     571          16 :     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
     572          71 :          I != E; ++I) {
     573         110 :       if (!HaveScalarStores && TII->isScalarStore(*I))
     574             :         HaveScalarStores = true;
     575             : 
     576          55 :       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
     577             :         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
     578             :         // vccz bit, so when we detect that an instruction may read from a
     579             :         // corrupt vccz bit, we need to:
     580             :         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
     581             :         //    complete.
     582             :         // 2. Restore the correct value of vccz by writing the current value
     583             :         //    of vcc back to vcc.
     584             : 
     585          64 :         if (TII->isSMRD(I->getOpcode())) {
     586           3 :           VCCZCorrupt = true;
     587          52 :         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
     588             :           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
     589             :           // Whenever we store a value in vcc, the correct value of vccz is
     590             :           // restored.
     591           0 :           VCCZCorrupt = false;
     592             :         }
     593             : 
     594             :         // Check if we need to apply the bug work-around
     595          32 :         if (VCCZCorrupt && readsVCCZ(*I)) {
     596             :           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
     597             : 
     598             :           // Wait on everything, not just LGKM.  vccz reads usually come from
     599             :           // terminators, and we always wait on everything at the end of the
     600             :           // block, so if we only wait on LGKM here, we might end up with
     601             :           // another s_waitcnt inserted right after this if there are non-LGKM
     602             :           // instructions still outstanding.
     603           1 :           insertWait(MBB, I, LastIssued);
     604             : 
     605             :           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
     606             :           // bit is updated, so we can restore the bit by reading the value of
     607             :           // vcc and then writing it back to the register.
     608           3 :           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
     609           1 :                   AMDGPU::VCC)
     610           1 :             .addReg(AMDGPU::VCC);
     611             :         }
     612             :       }
     613             : 
     614             :       // Record pre-existing, explicitly requested waits
     615         110 :       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
     616           0 :         handleExistingWait(*I);
     617           0 :         RemoveMI.push_back(&*I);
     618           0 :         continue;
     619             :       }
     620             : 
     621             :       Counters Required;
     622             : 
     623             :       // Wait for everything before a barrier.
     624             :       //
     625             :       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
     626             :       // but we also want to wait for any other outstanding transfers before
     627             :       // signalling other hardware blocks
     628           0 :       if ((I->getOpcode() == AMDGPU::S_BARRIER &&
     629          55 :                !ST->hasAutoWaitcntBeforeBarrier()) ||
     630         110 :            I->getOpcode() == AMDGPU::S_SENDMSG ||
     631             :            I->getOpcode() == AMDGPU::S_SENDMSGHALT)
     632           0 :         Required = LastIssued;
     633             :       else
     634          55 :         Required = handleOperands(*I);
     635             : 
     636          55 :       Counters Increment = getHwCounts(*I);
     637             : 
     638          91 :       if (countersNonZero(Required) || countersNonZero(Increment))
     639             :         increaseCounters(Required, DelayedWaitOn);
     640             : 
     641          55 :       Changes |= insertWait(MBB, I, Required);
     642             : 
     643          55 :       pushInstruction(MBB, I, Increment);
     644          55 :       handleSendMsg(MBB, I);
     645             : 
     646         110 :       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
     647             :           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
     648           6 :         EndPgmBlocks.push_back(&MBB);
     649             :     }
     650             : 
     651             :     // Wait for everything at the end of the MBB. If there is only one
     652             :     // successor, we can defer this until the uses there.
     653          16 :     if (!hasTrivialSuccessor(MBB))
     654          13 :       Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
     655             :   }
     656             : 
     657           5 :   if (HaveScalarStores) {
     658             :     // If scalar writes are used, the cache must be flushed or else the next
     659             :     // wave to reuse the same scratch memory can be clobbered.
     660             :     //
     661             :     // Insert s_dcache_wb at wave termination points if there were any scalar
     662             :     // stores, and only if the cache hasn't already been flushed. This could be
     663             :     // improved by looking across blocks for flushes in postdominating blocks
     664             :     // from the stores but an explicitly requested flush is probably very rare.
     665           0 :     for (MachineBasicBlock *MBB : EndPgmBlocks) {
     666             :       bool SeenDCacheWB = false;
     667             : 
     668           0 :       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
     669           0 :            I != E; ++I) {
     670           0 :         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
     671             :           SeenDCacheWB = true;
     672           0 :         else if (TII->isScalarStore(*I))
     673             :           SeenDCacheWB = false;
     674             : 
     675             :         // FIXME: It would be better to insert this before a waitcnt if any.
     676           0 :         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
     677           0 :              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
     678             :           Changes = true;
     679           0 :           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
     680             :         }
     681             :       }
     682             :     }
     683             :   }
     684             : 
     685           5 :   for (MachineInstr *I : RemoveMI)
     686           0 :     I->eraseFromParent();
     687             : 
     688           5 :   if (!MFI->isEntryFunction()) {
     689             :     // Wait for any outstanding memory operations that the input registers may
     690             :     // depend on. We can't track them and it's better to insert the wait
     691             :     // after the costly call sequence.
     692             : 
     693             :     // TODO: Could insert earlier and schedule more liberally with operations
     694             :     // that only use caller preserved registers.
     695             :     MachineBasicBlock &EntryBB = MF.front();
     696           0 :     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
     697             :       .addImm(0);
     698             : 
     699             :     Changes = true;
     700             :   }
     701             : 
     702           5 :   return Changes;
     703             : }

Generated by: LCOV version 1.13