LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaits.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 204 232 87.9 %
Date: 2017-09-14 15:23:50 Functions: 13 16 81.2 %
Legend: Lines: hit not hit

          Line data    Source code
//===- SIInsertWaits.cpp - Insert Waits for Memory Reads and Writes -------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/SmallVector.h"
      27             : #include "llvm/ADT/StringRef.h"
      28             : #include "llvm/CodeGen/MachineBasicBlock.h"
      29             : #include "llvm/CodeGen/MachineFunction.h"
      30             : #include "llvm/CodeGen/MachineFunctionPass.h"
      31             : #include "llvm/CodeGen/MachineInstr.h"
      32             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      33             : #include "llvm/CodeGen/MachineOperand.h"
      34             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      35             : #include "llvm/IR/DebugLoc.h"
      36             : #include "llvm/MC/MCInstrDesc.h"
      37             : #include "llvm/Pass.h"
      38             : #include "llvm/Support/Debug.h"
      39             : #include "llvm/Support/raw_ostream.h"
      40             : #include <algorithm>
      41             : #include <cassert>
      42             : #include <cstdint>
      43             : #include <cstring>
      44             : #include <utility>
      45             : 
      46             : #define DEBUG_TYPE "si-insert-waits"
      47             : 
      48             : using namespace llvm;
      49             : 
      50             : namespace {
      51             : 
/// \brief One variable for each of the hardware counters
using Counters = union {
  struct {
    unsigned VM;   // vmcnt
    unsigned EXP;  // expcnt
    unsigned LGKM; // lgkmcnt
  } Named;
  unsigned Array[3]; // Same three counters, addressable by index (0=VM, 1=EXP, 2=LGKM).
};

/// \brief Coarse classification of the previously issued instruction, used to
/// detect back-to-back VMEM/SMEM clauses.
using InstType = enum {
  OTHER,
  SMEM,
  VMEM
};

// One Counters record per register slot, indexed by the register's hardware
// encoding value (see getRegInterval).
using RegCounters =  Counters[512];

// Half-open range [first, second) of 32-bit register slots.
using RegInterval = std::pair<unsigned, unsigned>;
      70             : 
/// \brief Machine pass that inserts S_WAITCNT instructions so results of
/// asynchronous memory operations are available before they are consumed.
class SIInsertWaits : public MachineFunctionPass {
private:
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI;
  AMDGPU::IsaInfo::IsaVersion ISA;

  /// \brief Constant zero value
  static const Counters ZeroCounts;

  /// \brief Hardware limits
  Counters HardwareLimits;

  /// \brief Counter values we have already waited on.
  Counters WaitedOn;

  /// \brief Counter values that we must wait on before the next counter
  /// increase.
  Counters DelayedWaitOn;

  /// \brief Counter values for last instruction issued.
  Counters LastIssued;

  /// \brief Registers used by async instructions.
  RegCounters UsedRegs;

  /// \brief Registers defined by async instructions.
  RegCounters DefinedRegs;

  /// \brief Different export instruction types seen since last wait.
  /// Bit 0: EXP instruction seen; bit 1: VM-write seen (see pushInstruction).
  unsigned ExpInstrTypesSeen = 0;

  /// \brief Type of the last opcode.
  InstType LastOpcodeType;

  /// \brief Whether the last issued instruction wrote to M0 (needs an S_NOP
  /// before a following S_SENDMSG — see handleSendMsg).
  bool LastInstWritesM0;

  /// Whether or not we have flat operations outstanding.
  bool IsFlatOutstanding;

  /// \brief Whether the machine function returns void
  bool ReturnsVoid;

  /// Whether the VCCZ bit is possibly corrupt
  bool VCCZCorrupt = false;

  /// \brief Get increment/decrement amount for this instruction.
  Counters getHwCounts(MachineInstr &MI);

  /// \brief Is operand relevant for async execution?
  bool isOpRelevant(MachineOperand &Op);

  /// \brief Get register interval an operand affects.
  RegInterval getRegInterval(const TargetRegisterClass *RC,
                             const MachineOperand &Reg) const;

  /// \brief Handle instructions async components
  void pushInstruction(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I,
                       const Counters& Increment);

  /// \brief Insert the actual wait instruction
  bool insertWait(MachineBasicBlock &MBB,
                  MachineBasicBlock::iterator I,
                  const Counters &Counts);

  /// \brief Handle existing wait instructions (from intrinsics)
  void handleExistingWait(MachineBasicBlock::iterator I);

  /// \brief Do we need def2def checks?
  bool unorderedDefines(MachineInstr &MI);

  /// \brief Resolve all operand dependencies to counter requirements
  Counters handleOperands(MachineInstr &MI);

  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  /// Return true if there are LGKM instructions that haven't been waited on
  /// yet.
  bool hasOutstandingLGKM() const;

public:
  static char ID;

  SIInsertWaits() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
     170             : 
     171             : } // end anonymous namespace
     172             : 
// Register the pass with the LLVM pass infrastructure under DEBUG_TYPE.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

// Unique address used as the pass identifier.
char SIInsertWaits::ID = 0;

char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
     181             : 
     182           0 : FunctionPass *llvm::createSIInsertWaitsPass() {
     183           0 :   return new SIInsertWaits();
     184             : }
     185             : 
// All-zero counter tuple used to reset per-function tracking state.
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
     187             : 
     188             : static bool readsVCCZ(const MachineInstr &MI) {
     189          50 :   unsigned Opc = MI.getOpcode();
     190          52 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     191           4 :          !MI.getOperand(1).isUndef();
     192             : }
     193             : 
     194             : bool SIInsertWaits::hasOutstandingLGKM() const {
     195             :   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
     196             : }
     197             : 
/// \brief Compute how much each hardware counter is incremented by \p MI,
/// based on the instruction's TSFlags.
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
  uint64_t TSFlags = MI.getDesc().TSFlags;
  Counters Result = { { 0, 0, 0 } };

  // Any VM instruction bumps VM_CNT by exactly one.
  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

  // Only consider stores or EXP for EXP_CNT
  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();

  // LGKM may use larger values.
  if (TSFlags & SIInstrFlags::LGKM_CNT) {

    if (TII->isSMRD(MI)) {

      if (MI.getNumOperands() != 0) {
        assert(MI.getOperand(0).isReg() &&
               "First LGKM operand must be a register!");

        // XXX - What if this is a write into a super register?
        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
        unsigned Size = TRI->getRegSizeInBits(*RC);
        // One increment for a 32-bit result, two for anything wider.
        Result.Named.LGKM = Size > 32 ? 2 : 1;
      } else {
        // s_dcache_inv etc. do not have a destination register. Assume we
        // want a wait on these.
        // XXX - What is the right value?
        Result.Named.LGKM = 1;
      }
    } else {
      // DS
      Result.Named.LGKM = 1;
    }

  } else {
    Result.Named.LGKM = 0;
  }

  return Result;
}
     237             : 
/// \brief Is operand \p Op relevant for async execution tracking?
///
/// Relevant operands are allocatable registers that are defined by the
/// instruction, or read as the value of an export/store.
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  // Constants are always irrelevant
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    return false;

  // Defines are always relevant
  if (Op.isDef())
    return true;

  // For exports all registers are relevant.
  // TODO: Skip undef/disabled registers.
  MachineInstr &MI = *Op.getParent();
  if (TII->isEXP(MI))
    return true;

  // For stores the stored value is also relevant
  if (!MI.getDesc().mayStore())
    return false;

  // Check if this operand is the value being stored.
  // Special case for DS/FLAT instructions, since the address
  // operand comes before the value operand and it may have
  // multiple data operands.

  if (TII->isDS(MI)) {
    // DS may carry up to two data operands (e.g. 2-address write ops).
    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    if (Data0 && Op.isIdenticalTo(*Data0))
      return true;

    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    return Data1 && Op.isIdenticalTo(*Data1);
  }

  if (TII->isFLAT(MI)) {
    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
    if (Data && Op.isIdenticalTo(*Data))
      return true;
  }

  // NOTE: This assumes that the value operand is before the
  // address operand, and that there is only one value operand.
  for (MachineInstr::mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I) {

    // Only the first register use can be the stored value.
    if (I->isReg() && I->isUse())
      return Op.isIdenticalTo(*I);
  }

  return false;
}
     288             : 
     289             : RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
     290             :                                           const MachineOperand &Reg) const {
     291         438 :   unsigned Size = TRI->getRegSizeInBits(*RC);
     292             :   assert(Size >= 32);
     293             : 
     294         219 :   RegInterval Result;
     295         438 :   Result.first = TRI->getEncodingValue(Reg.getReg());
     296         219 :   Result.second = Result.first + Size / 32;
     297             : 
     298             :   return Result;
     299             : }
     300             : 
/// \brief Record the async effects of the instruction at \p I.
///
/// Bumps LastIssued by \p Increment, remembers which register slots are used
/// or defined by the outstanding operation, and breaks VMEM clauses with an
/// S_NOP on VI and newer.
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const Counters &Increment) {
  // Get the hardware counter increments and sum them up
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  if (TII->mayAccessFlatAddressSpace(*I))
    IsFlatOutstanding = true;

  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    // Limit records, per counter, the issued count this instruction must
    // be waited for on; untouched counters stay zero.
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if (LastOpcodeType == VMEM && Increment.Named.VM) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instructions we have seen
  // (bit 0: EXP seen, bit 1: VM-write seen; both => unordered EXP_CNT).
  if (Increment.Named.EXP) {
    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
  }

  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
     371             : 
/// \brief Emit an S_WAITCNT before \p I if \p Required exceeds what has
/// already been waited on.
///
/// \returns true if a wait instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {
  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered except when there are flat instructions, which
  // can return out of order.
  Ordered[0] = !IsFlatOutstanding;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = HardwareLimits;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // In-order counter: wait until only (LastIssued - Required)
      // operations remain outstanding.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
    } else
      // Unordered counter: must drain completely.
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(AMDGPU::encodeWaitcnt(ISA,
                                  Counts.Named.VM,
                                  Counts.Named.EXP,
                                  Counts.Named.LGKM));

  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  return true;
}
     437             : 
     438             : /// \brief helper function for handleOperands
     439             : static void increaseCounters(Counters &Dst, const Counters &Src) {
     440        2679 :   for (unsigned i = 0; i < 3; ++i)
     441        2502 :     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
     442             : }
     443             : 
     444             : /// \brief check whether any of the counters is non-zero
     445             : static bool countersNonZero(const Counters &Counter) {
     446         829 :   for (unsigned i = 0; i < 3; ++i)
     447         391 :     if (Counter.Array[i])
     448             :       return true;
     449             :   return false;
     450             : }
     451             : 
     452           0 : void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
     453             :   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
     454             : 
     455           0 :   unsigned Imm = I->getOperand(0).getImm();
     456             :   Counters Counts, WaitOn;
     457             : 
     458           0 :   Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
     459           0 :   Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
     460           0 :   Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
     461             : 
     462           0 :   for (unsigned i = 0; i < 3; ++i) {
     463           0 :     if (Counts.Array[i] <= LastIssued.Array[i])
     464           0 :       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
     465             :     else
     466           0 :       WaitOn.Array[i] = 0;
     467             :   }
     468             : 
     469           0 :   increaseCounters(DelayedWaitOn, WaitOn);
     470           0 : }
     471             : 
/// \brief Resolve all register dependencies of \p MI into the counter values
/// that must be waited on before it executes.
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
  Counters Result = ZeroCounts;

  // For each register affected by this instruction increase the result
  // sequence.
  //
  // TODO: We could probably just look at explicit operands if we removed VCC /
  // EXEC from SMRD dest reg classes.
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {
      // Defs must wait for earlier reads and writes of the slot (WAR/WAW).
      if (Op.isDef()) {
        increaseCounters(Result, UsedRegs[j]);
        increaseCounters(Result, DefinedRegs[j]);
      }

      // Uses must wait for earlier writes of the slot (RAW).
      if (Op.isUse())
        increaseCounters(Result, DefinedRegs[j]);
    }
  }

  return Result;
}
     500             : 
     501          90 : void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
     502             :                                   MachineBasicBlock::iterator I) {
     503          90 :   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
     504             :     return;
     505             : 
     506             :   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
     507          28 :   if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
     508           0 :     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
     509           0 :     LastInstWritesM0 = false;
     510           0 :     return;
     511             :   }
     512             : 
     513             :   // Set whether this instruction sets M0
     514          28 :   LastInstWritesM0 = false;
     515             : 
     516          28 :   unsigned NumOperands = I->getNumOperands();
     517         282 :   for (unsigned i = 0; i < NumOperands; i++) {
     518         226 :     const MachineOperand &Op = I->getOperand(i);
     519             : 
     520         186 :     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
     521           0 :       LastInstWritesM0 = true;
     522             :   }
     523             : }
     524             : 
     525             : /// Return true if \p MBB has one successor immediately following, and is its
     526             : /// only predecessor
     527          28 : static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
     528          28 :   if (MBB.succ_size() != 1)
     529             :     return false;
     530             : 
     531           8 :   const MachineBasicBlock *Succ = *MBB.succ_begin();
     532           8 :   return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
     533             : }
     534             : 
     535             : // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
     536             : // around other non-memory instructions.
     537          15 : bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
     538          15 :   bool Changes = false;
     539             : 
     540          15 :   ST = &MF.getSubtarget<SISubtarget>();
     541          30 :   TII = ST->getInstrInfo();
     542          30 :   TRI = &TII->getRegisterInfo();
     543          15 :   MRI = &MF.getRegInfo();
     544          30 :   ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
     545          15 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     546             : 
     547          15 :   HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
     548          15 :   HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
     549          15 :   HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
     550             : 
     551          15 :   WaitedOn = ZeroCounts;
     552          15 :   DelayedWaitOn = ZeroCounts;
     553          15 :   LastIssued = ZeroCounts;
     554          15 :   LastOpcodeType = OTHER;
     555          15 :   LastInstWritesM0 = false;
     556          15 :   IsFlatOutstanding = false;
     557          15 :   ReturnsVoid = MFI->returnsVoid();
     558             : 
     559          15 :   memset(&UsedRegs, 0, sizeof(UsedRegs));
     560          15 :   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
     561             : 
     562          30 :   SmallVector<MachineInstr *, 4> RemoveMI;
     563          30 :   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
     564             : 
     565          15 :   bool HaveScalarStores = false;
     566             : 
     567          30 :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
     568          43 :        BI != BE; ++BI) {
     569          28 :     MachineBasicBlock &MBB = *BI;
     570             : 
     571          56 :     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
     572         118 :          I != E; ++I) {
     573         252 :       if (!HaveScalarStores && TII->isScalarStore(*I))
     574             :         HaveScalarStores = true;
     575             : 
     576          90 :       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
     577             :         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
     578             :         // vccz bit, so when we detect that an instruction may read from a
     579             :         // corrupt vccz bit, we need to:
     580             :         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
     581             :         //    complete.
     582             :         // 2. Restore the correct value of vccz by writing the current value
     583             :         //    of vcc back to vcc.
     584             : 
     585         248 :         if (TII->isSMRD(I->getOpcode())) {
     586          12 :           VCCZCorrupt = true;
     587         124 :         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
     588             :           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
     589             :           // Whenever we store a value in vcc, the correct value of vccz is
     590             :           // restored.
     591           0 :           VCCZCorrupt = false;
     592             :         }
     593             : 
     594             :         // Check if we need to apply the bug work-around
     595         113 :         if (VCCZCorrupt && readsVCCZ(*I)) {
     596             :           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
     597             : 
     598             :           // Wait on everything, not just LGKM.  vccz reads usually come from
     599             :           // terminators, and we always wait on everything at the end of the
     600             :           // block, so if we only wait on LGKM here, we might end up with
     601             :           // another s_waitcnt inserted right after this if there are non-LGKM
     602             :           // instructions still outstanding.
     603           1 :           insertWait(MBB, I, LastIssued);
     604             : 
     605             :           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
     606             :           // bit is updated, so we can restore the bit by reading the value of
     607             :           // vcc and then writing it back to the register.
     608           3 :           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
     609           4 :                   AMDGPU::VCC)
     610           1 :             .addReg(AMDGPU::VCC);
     611             :         }
     612             :       }
     613             : 
     614             :       // Record pre-existing, explicitly requested waits
     615         180 :       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
     616           0 :         handleExistingWait(*I);
     617           0 :         RemoveMI.push_back(&*I);
     618           0 :         continue;
     619             :       }
     620             : 
     621             :       Counters Required;
     622             : 
     623             :       // Wait for everything before a barrier.
     624             :       //
     625             :       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
     626             :       // but we also want to wait for any other outstanding transfers before
     627             :       // signalling other hardware blocks
     628          90 :       if ((I->getOpcode() == AMDGPU::S_BARRIER &&
     629          90 :                !ST->hasAutoWaitcntBeforeBarrier()) ||
     630         270 :            I->getOpcode() == AMDGPU::S_SENDMSG ||
     631          90 :            I->getOpcode() == AMDGPU::S_SENDMSGHALT)
     632           0 :         Required = LastIssued;
     633             :       else
     634          90 :         Required = handleOperands(*I);
     635             : 
     636          90 :       Counters Increment = getHwCounts(*I);
     637             : 
     638         153 :       if (countersNonZero(Required) || countersNonZero(Increment))
     639             :         increaseCounters(Required, DelayedWaitOn);
     640             : 
     641          90 :       Changes |= insertWait(MBB, I, Required);
     642             : 
     643          90 :       pushInstruction(MBB, I, Increment);
     644          90 :       handleSendMsg(MBB, I);
     645             : 
     646         256 :       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
     647          76 :           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
     648          16 :         EndPgmBlocks.push_back(&MBB);
     649             :     }
     650             : 
     651             :     // Wait for everything at the end of the MBB. If there is only one
     652             :     // successor, we can defer this until the uses there.
     653          28 :     if (!hasTrivialSuccessor(MBB))
     654          25 :       Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
     655             :   }
     656             : 
     657          15 :   if (HaveScalarStores) {
     658             :     // If scalar writes are used, the cache must be flushed or else the next
     659             :     // wave to reuse the same scratch memory can be clobbered.
     660             :     //
     661             :     // Insert s_dcache_wb at wave termination points if there were any scalar
     662             :     // stores, and only if the cache hasn't already been flushed. This could be
     663             :     // improved by looking across blocks for flushes in postdominating blocks
     664             :     // from the stores but an explicitly requested flush is probably very rare.
     665          26 :     for (MachineBasicBlock *MBB : EndPgmBlocks) {
     666           8 :       bool SeenDCacheWB = false;
     667             : 
     668          16 :       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
     669          26 :            I != E; ++I) {
     670          36 :         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
     671             :           SeenDCacheWB = true;
     672          32 :         else if (TII->isScalarStore(*I))
     673           7 :           SeenDCacheWB = false;
     674             : 
     675             :         // FIXME: It would be better to insert this before a waitcnt if any.
     676          29 :         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
     677          37 :              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
     678           7 :           Changes = true;
     679          28 :           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
     680             :         }
     681             :       }
     682             :     }
     683             :   }
     684             : 
     685          45 :   for (MachineInstr *I : RemoveMI)
     686           0 :     I->eraseFromParent();
     687             : 
     688          15 :   if (!MFI->isEntryFunction()) {
     689             :     // Wait for any outstanding memory operations that the input registers may
     690             :     // depend on. We can't track them and it's better to do the wait after the
     691             :     // costly call sequence.
     692             : 
     693             :     // TODO: Could insert earlier and schedule more liberally with operations
     694             :     // that only use caller preserved registers.
     695           2 :     MachineBasicBlock &EntryBB = MF.front();
     696          10 :     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
     697           2 :       .addImm(0);
     698             : 
     699           2 :     Changes = true;
     700             :   }
     701             : 
     702          30 :   return Changes;
     703             : }

Generated by: LCOV version 1.13