//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an s_waitcnt
// instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//   0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//   NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};
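
// Worked example of the index space defined above: VGPR5 maps to scoreboard
// slot 5, the LDS token maps to slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256,
// and SGPR3 maps to slot NUM_ALL_VGPRS + 3 = 260.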

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt, in order to know whether there are multiple types of events within
// the brackets. When multiple event types are present in a bracket, the
// counter may be decremented out of order, so we need to put in an
// "s_waitcnt 0" before the use.
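// For example, after two independent VMEM loads the VM_CNT bracket is
// [LB, UB] = [0, 2]; a use of the first load's destination (score 1) is
// satisfied by "s_waitcnt vmcnt(1)", which raises the lower bound to 1.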
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }
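  // The EXP_CNT clamp above keeps the bracket no wider than the hardware
  // counter: for example, with ExpcntMax = 7, raising the upper bound to 12
  // forces the lower bound up to at least 5.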

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of VgprUB and SgprUB to make the merge at a join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr; only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
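
// For example, a 64-bit VGPR operand starting at v4 yields the half-open
// interval {4, 6}: Size / 32 == 2, so the per-register loops that consume
// these intervals visit scoreboard slots 4 and 5.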

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not; examples include vm_cnt for a
  // buffer store, or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // event types in the bracket. Also emit an s_wait counter with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}
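
// Worked example: with an LGKM bracket of [LB, UB] = [0, 3] containing only
// LDS accesses, waiting on a score of 2 raises LB to 2, and the caller later
// emits s_waitcnt lgkmcnt(1) (i.e. UB - LB). If an SMEM access were also
// pending in the bracket, counterOutOfOrder() would instead force LB to UB,
// yielding lgkmcnt(0).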

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // A scalar memory read can always complete out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}
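
// For example, an LGKM bracket with a pending SMEM load is always treated as
// out of order, while one holding only LDS accesses is not; the EXP bracket
// becomes out of order once, say, both a VMEM-write lock and an export
// parameter write are pending at the same time.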

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Given wait count encodings, check whether LHS is stronger than RHS.
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}

/// Given wait count encodings, create a new encoding that is stronger
/// than or equal to both.
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}
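
// Illustrative example (counter values chosen arbitrarily, and assuming any
// field not written in the mnemonic encodes its maximum, don't-care value):
// combining "s_waitcnt vmcnt(1) lgkmcnt(0)" with "s_waitcnt vmcnt(0)
// expcnt(2)" takes the per-field minimum, which is equivalent to
// AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, /*Expcnt=*/2, /*Lgkmcnt=*/0).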

/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand that was used by a recent export/store ins,
    // add s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM. vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate an s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>(ST);
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          LLVM_DEBUG(dbgs()
                         << "set-revisit2: Block"
                         << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcount, or make a new one.
      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
      // We don't remove waitcnts that existed prior to the waitcnt
      // pass. Check if the waitcnt to-be-inserted can be avoided
      // or if the prev waitcnt can be updated.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            OldWaitcnt = &*I;
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            LLVM_DEBUG(
                dbgs()
                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            LLVM_DEBUG(dbgs()
                       << "Force emit a s_waitcnt due to debug counter\n");

          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *OldWaitcnt << '\n');
        } else {
          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                   MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                               .addImm(Enc);
          TrackedWaitcntSet.insert(SWaitInst);

          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *SWaitInst << '\n');
        }
      }

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory tokens
// for both LDS and Memory, and if so, mark it as a flat access.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
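
// For example, a buffer load is scored as a VMEM_ACCESS event (vm_cnt), a DS
// access as LDS_ACCESS (lgkm_cnt), and a buffer store on targets where
// vmemWriteNeedsExpWaitcnt() holds additionally produces a VMW_GPR_LOCK
// event on exp_cnt for its data source.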
1380 :
1381 : // Merge the score brackets of the Block's predecessors;
1382 : // this merged score bracket is used when adding waitcnts to the Block
1383 22641 : void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1384 22641 : BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1385 22641 : int32_t MaxPending[NUM_INST_CNTS] = {0};
1386 22641 : int32_t MaxFlat[NUM_INST_CNTS] = {0};
1387 : bool MixedExpTypes = false;
1388 :
1389 : // For single basic block loops, we need to retain the Block's
1390 : // score bracket to have accurate Pred info. So, make a copy of Block's
1391 : // score bracket, clear() it (which retains several important bits of info),
1392 : // populate, and then replace en masse. For non-single basic block loops,
1393 : // just clear Block's current score bracket and repopulate in-place.
1394 : bool IsSelfPred;
1395 : std::unique_ptr<BlockWaitcntBrackets> S;
1396 :
1397 45282 : IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1398 : != Block.pred_end();
1399 22641 : if (IsSelfPred) {
1400 344 : S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1401 : ScoreBrackets = S.get();
1402 : }
1403 :
1404 22641 : ScoreBrackets->clear();
1405 :
1406 : // See if there are any uninitialized predecessors. If so, emit an
1407 : // s_waitcnt 0 at the beginning of the block.
1408 27015 : for (MachineBasicBlock *Pred : Block.predecessors()) {
1409 : BlockWaitcntBrackets *PredScoreBrackets =
1410 : BlockWaitcntBracketsMap[Pred].get();
1411 4374 : bool Visited = BlockVisitedSet.count(Pred);
1412 4282 : if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1413 92 : continue;
1414 : }
1415 17128 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1416 12846 : T = (enum InstCounterType)(T + 1)) {
1417 : int span =
1418 12846 : PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1419 12846 : MaxPending[T] = std::max(MaxPending[T], span);
1420 12846 : span =
1421 12846 : PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1422 12883 : MaxFlat[T] = std::max(MaxFlat[T], span);
1423 : }
1424 :
1425 4282 : MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1426 : }
1427 :
1428 : // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1429 : // Also handle kills for exit block.
1430 22641 : if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1431 1678 : for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1432 3020 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1433 2265 : T = (enum InstCounterType)(T + 1)) {
1434 2265 : int Span = KillWaitBrackets[I]->getScoreUB(T) -
1435 2265 : KillWaitBrackets[I]->getScoreLB(T);
1436 2265 : MaxPending[T] = std::max(MaxPending[T], Span);
1437 2265 : Span = KillWaitBrackets[I]->pendingFlat(T) -
1438 : KillWaitBrackets[I]->getScoreLB(T);
1439 2265 : MaxFlat[T] = std::max(MaxFlat[T], Span);
1440 : }
1441 :
1442 755 : MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1443 : }
1444 : }
1445 :
1446 : // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1447 27015 : for (MachineBasicBlock *Pred : Block.predecessors()) {
1448 : BlockWaitcntBrackets *PredScoreBrackets =
1449 : BlockWaitcntBracketsMap[Pred].get();
1450 4374 : bool Visited = BlockVisitedSet.count(Pred);
1451 4282 : if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1452 92 : continue;
1453 : }
1454 :
1455 4282 : int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1456 4282 : PredScoreBrackets->getScoreLB(EXP_CNT);
1457 4282 : MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1458 4282 : int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1459 4282 : PredScoreBrackets->getScoreLB(EXP_CNT);
1460 4282 : MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1461 : }
1462 :
1463 : // TODO: Is SC Block->IsMainExit() the same as Block.succ_empty()?
1464 22641 : if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1465 1678 : for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1466 755 : int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1467 755 : KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1468 755 : MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1469 755 : int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1470 755 : KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1471 755 : MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1472 : }
1473 : }
1474 :
1475 : #if 0
1476 : // LC, unlike this pass, does not add a waitcnt at the beginning. Left here as a marker.
1477 : // TODO: How does LC distinguish between function entry and main entry?
1478 : // If this is the entry to a function, force a wait.
1479 : MachineBasicBlock &Entry = Block.getParent()->front();
1480 : if (Entry.getNumber() == Block.getNumber()) {
1481 : ScoreBrackets->setWaitAtBeginning();
1482 : return;
1483 : }
1484 : #endif
1485 :
1486 : // Now set the current Block's brackets to the largest ending bracket.
1487 90564 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1488 67923 : T = (enum InstCounterType)(T + 1)) {
1489 67923 : ScoreBrackets->setScoreUB(T, MaxPending[T]);
1490 : ScoreBrackets->setScoreLB(T, 0);
1491 67923 : ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1492 : }
1493 :
1494 : ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1495 :
1496 : // Set the register scoreboard.
1497 27015 : for (MachineBasicBlock *Pred : Block.predecessors()) {
1498 : if (!BlockVisitedSet.count(Pred)) {
1499 92 : continue;
1500 : }
1501 :
1502 : BlockWaitcntBrackets *PredScoreBrackets =
1503 : BlockWaitcntBracketsMap[Pred].get();
1504 :
1505 : // Now merge the gpr_reg_score information
1506 17128 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1507 12846 : T = (enum InstCounterType)(T + 1)) {
1508 : int PredLB = PredScoreBrackets->getScoreLB(T);
1509 : int PredUB = PredScoreBrackets->getScoreUB(T);
1510 12846 : if (PredLB < PredUB) {
1511 2521 : int PredScale = MaxPending[T] - PredUB;
1512 : // Merge vgpr scores.
1513 87275 : for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1514 : int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1515 84754 : if (PredRegScore <= PredLB)
1516 78968 : continue;
1517 5786 : int NewRegScore = PredScale + PredRegScore;
1518 5786 : ScoreBrackets->setRegScore(
1519 11572 : J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1520 : }
1521 : // Also need to merge sgpr scores for lgkm_cnt.
1522 2521 : if (T == LGKM_CNT) {
1523 6411 : for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1524 : int PredRegScore =
1525 5389 : PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1526 5389 : if (PredRegScore <= PredLB)
1527 3296 : continue;
1528 2093 : int NewRegScore = PredScale + PredRegScore;
1529 2093 : ScoreBrackets->setRegScore(
1530 : J + NUM_ALL_VGPRS, LGKM_CNT,
1531 : std::max(
1532 4186 : ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1533 : NewRegScore));
1534 : }
1535 : }
1536 : }
1537 : }
1538 :
1539 : // Also merge the WaitEvent information.
1540 47102 : ForAllWaitEventType(W) {
1541 : enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1542 : int PredEventUB = PredScoreBrackets->getEventUB(W);
1543 42820 : if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1544 : int NewEventUB =
1545 2540 : MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1546 2540 : if (NewEventUB > 0) {
1547 2540 : ScoreBrackets->setEventUB(
1548 4710 : W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1549 : }
1550 : }
1551 : }
1552 : }
1553 :
1554 : // TODO: Is SC Block->IsMainExit() the same as Block.succ_empty()?
1555 : // Set the register scoreboard.
1556 22641 : if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1557 923 : for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1558 : // Now merge the gpr_reg_score information.
1559 3020 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1560 2265 : T = (enum InstCounterType)(T + 1)) {
1561 2265 : int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1562 : int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1563 2265 : if (PredLB < PredUB) {
1564 22 : int PredScale = MaxPending[T] - PredUB;
1565 : // Merge vgpr scores.
1566 66 : for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1567 : int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1568 22 : if (PredRegScore <= PredLB)
1569 22 : continue;
1570 0 : int NewRegScore = PredScale + PredRegScore;
1571 0 : ScoreBrackets->setRegScore(
1572 0 : J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1573 : }
1574 : // Also need to merge sgpr scores for lgkm_cnt.
1575 22 : if (T == LGKM_CNT) {
1576 146 : for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1577 : int PredRegScore =
1578 102 : KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1579 102 : if (PredRegScore <= PredLB)
1580 56 : continue;
1581 46 : int NewRegScore = PredScale + PredRegScore;
1582 46 : ScoreBrackets->setRegScore(
1583 : J + NUM_ALL_VGPRS, LGKM_CNT,
1584 : std::max(
1585 92 : ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1586 : NewRegScore));
1587 : }
1588 : }
1589 : }
1590 : }
1591 :
1592 : // Also merge the WaitEvent information.
1593 8305 : ForAllWaitEventType(W) {
1594 7550 : enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1595 : int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1596 7550 : if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1597 : int NewEventUB =
1598 22 : MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1599 22 : if (NewEventUB > 0) {
1600 22 : ScoreBrackets->setEventUB(
1601 36 : W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1602 : }
1603 : }
1604 : }
1605 : }
1606 : }
1607 :
1608 : // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1609 : // sequencing predecessors, because changes to EXEC require waitcnts due to
1610 : // the delayed nature of these operations.
1611 27015 : for (MachineBasicBlock *Pred : Block.predecessors()) {
1612 : if (!BlockVisitedSet.count(Pred)) {
1613 92 : continue;
1614 : }
1615 :
1616 : BlockWaitcntBrackets *PredScoreBrackets =
1617 : BlockWaitcntBracketsMap[Pred].get();
1618 :
1619 : int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1620 4282 : if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1621 0 : int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1622 0 : PredScoreBrackets->getScoreUB(EXP_CNT);
1623 0 : if (new_gds_ub > 0) {
1624 0 : ScoreBrackets->setEventUB(
1625 : GDS_GPR_LOCK,
1626 0 : std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1627 : }
1628 : }
1629 : int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1630 4282 : if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1631 20 : int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1632 10 : PredScoreBrackets->getScoreUB(EXP_CNT);
1633 10 : if (new_exp_ub > 0) {
1634 10 : ScoreBrackets->setEventUB(
1635 : EXP_GPR_LOCK,
1636 10 : std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1637 : }
1638 : }
1639 : }
1640 :
1641 : // If this is a single-block loop, write back the merged score brackets.
1642 : // Not needed for other blocks, as we did the merge in-place.
1643 22641 : if (IsSelfPred) {
1644 688 : BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1645 : }
1646 22641 : }
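// [Editorial example] A self-contained sketch of the rebasing arithmetic
// used throughout the merge above (for register scores and event UBs
// alike): a pending predecessor score S in (PredLB, PredUB] is shifted by
// PredScale = MaxPending - PredUB so it lands in the merged bracket
// [0, MaxPending]. Names and values here are illustrative only.
#if 0
#include <cassert>

static int rebasePredScore(int PredLB, int PredUB, int MaxPending, int S) {
  assert(PredLB < S && S <= PredUB && "only pending scores are rebased");
  int PredScale = MaxPending - PredUB; // shift into the merged bracket
  return PredScale + S;                // the NewRegScore computation above
}

static void rebaseExample() {
  // A predecessor bracket [2, 5] merged into a bracket with MaxPending = 7:
  // its most recent pending event (score 5) stays the most recent (score 7),
  // and relative distances between pending events are preserved.
  assert(rebasePredScore(2, 5, 7, 5) == 7);
  assert(rebasePredScore(2, 5, 7, 3) == 5);
}
#endif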
1647 :
1648 : /// Return true if the given basic block is a "bottom" block of a loop.
1649 : /// This works even if the loop is discontiguous. This also handles
1650 : /// multiple back-edges for the same "header" block of a loop.
1651 0 : bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1652 : const MachineBasicBlock *Block) {
1653 0 : for (MachineBasicBlock *MBB : Loop->blocks()) {
1654 0 : if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1655 0 : return true;
1656 : }
1657 : }
1658 : return false;
1659 : }
1660 :
1661 : /// Count the number of "bottom" basic blocks of a loop.
1662 0 : unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1663 : unsigned Count = 0;
1664 0 : for (MachineBasicBlock *MBB : Loop->blocks()) {
1665 0 : if (MBB->isSuccessor(Loop->getHeader())) {
1666 0 : Count++;
1667 : }
1668 : }
1669 0 : return Count;
1670 : }
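// [Editorial example] A toy model (plain integer block IDs instead of
// MachineBasicBlocks) of the "bottom block" notion used by the two helpers
// above: a loop block with a back-edge to the header. A loop with N bottom
// blocks is given up to N+1 convergence iterations further below.
#if 0
#include <set>
#include <utility>

static unsigned countBottoms(const std::set<std::pair<int, int>> &Edges,
                             const std::set<int> &LoopBlocks, int Header) {
  unsigned Count = 0;
  for (int B : LoopBlocks)
    if (Edges.count({B, Header})) // B -> Header is a back-edge
      ++Count;
  return Count;
}

static void bottomsExample() {
  // Loop {1, 2, 3} with header 1 and back-edges 2->1 and 3->1: two bottoms.
  std::set<std::pair<int, int>> E = {{1, 2}, {2, 1}, {2, 3}, {3, 1}};
  unsigned N = countBottoms(E, {1, 2, 3}, 1);
  (void)N; // N == 2
}
#endif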
1671 :
1672 : // Generate s_waitcnt instructions where needed.
1673 0 : void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1674 : MachineBasicBlock &Block) {
1675 : // Initialize the state information.
1676 0 : mergeInputScoreBrackets(Block);
1677 :
1678 0 : BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1679 :
1680 : LLVM_DEBUG({
1681 : dbgs() << "*** Block" << Block.getNumber() << " ***";
1682 : ScoreBrackets->dump();
1683 : });
1684 :
1685 : // Walk over the instructions.
1686 : for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1687 0 : Iter != E;) {
1688 : MachineInstr &Inst = *Iter;
1689 : // Remove any previously existing waitcnts.
1690 0 : if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1691 : // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1692 : // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1693 : // as needed.
1694 : if (!TrackedWaitcntSet.count(&Inst))
1695 : ++Iter;
1696 : else {
1697 : ++Iter;
1698 0 : Inst.removeFromParent();
1699 : }
1700 : ScoreBrackets->setWaitcnt(&Inst);
1701 0 : continue;
1702 : }
1703 :
1704 : // Kill instructions generate a conditional branch to the endmain block.
1705 : // Merge the current waitcnt state into the endmain block information.
1706 : // TODO: Are there other flavors of KILL instruction?
1707 0 : if (Inst.getOpcode() == AMDGPU::KILL) {
1708 0 : addKillWaitBracket(ScoreBrackets);
1709 : }
1710 :
1711 : bool VCCZBugWorkAround = false;
1712 : if (readsVCCZ(Inst) &&
1713 : (!VCCZBugHandledSet.count(&Inst))) {
1714 : if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1715 0 : ScoreBrackets->getScoreUB(LGKM_CNT) &&
1716 : ScoreBrackets->hasPendingSMEM()) {
1717 0 : if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1718 : VCCZBugWorkAround = true;
1719 : }
1720 : }
1721 :
1722 : // Generate an s_waitcnt instruction to be placed before
1723 : // the current instruction (Inst), if needed.
1724 0 : generateWaitcntInstBefore(Inst, ScoreBrackets);
1725 :
1726 0 : updateEventWaitcntAfter(Inst, ScoreBrackets);
1727 :
1728 : #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1729 : // If this instruction generates an S_SETVSKIP because it is an
1730 : // indexed resource, and we are on Tahiti, then it will also force
1731 : // an S_WAITCNT vmcnt(0)
1732 : if (RequireCheckResourceType(Inst, context)) {
1733 : // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1734 : ScoreBrackets->setScoreLB(VM_CNT,
1735 : ScoreBrackets->getScoreUB(VM_CNT));
1736 : }
1737 : #endif
1738 :
1739 : ScoreBrackets->clearWaitcnt();
1740 :
1741 : LLVM_DEBUG({
1742 : Inst.print(dbgs());
1743 : ScoreBrackets->dump();
1744 : });
1745 :
1746 : // Check to see if this is a GWS instruction. If so, and if this is CI or
1747 : // VI, then the generated code sequence will include an S_WAITCNT 0.
1748 : // TODO: Are these the only GWS instructions?
1749 0 : if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1750 0 : Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1751 0 : Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1752 0 : Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1753 : Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1754 : // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1755 0 : ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1756 0 : ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1757 0 : ScoreBrackets->updateByWait(LGKM_CNT,
1758 : ScoreBrackets->getScoreUB(LGKM_CNT));
1759 : }
1760 :
1761 : // TODO: Remove this work-around after fixing the scheduler and enable the
1762 : // assert above.
1763 0 : if (VCCZBugWorkAround) {
1764 : // Restore the vccz bit. Any time a value is written to vcc, the vcc
1765 : // bit is updated, so we can restore the bit by reading the value of
1766 : // vcc and then writing it back to the register.
1767 0 : BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1768 0 : AMDGPU::VCC)
1769 0 : .addReg(AMDGPU::VCC);
1770 0 : VCCZBugHandledSet.insert(&Inst);
1771 : }
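// [Editorial example] A toy model of the vccz hazard handled above: the
// hardware caches "vcc == 0" in the vccz status bit, but on the affected
// (pre-VI) parts a value landing in vcc from an in-flight scalar-memory
// load does not refresh the cached bit. Any explicit write to vcc does,
// which is why the pass emits "s_mov_b64 vcc, vcc". This is a model only;
// the real semantics live in the hardware, not in this pass.
#if 0
#include <cassert>
#include <cstdint>

struct StatusModel {
  uint64_t Vcc = 0;
  bool Vccz = true; // cached "vcc == 0" bit
  void writeVcc(uint64_t V) { Vcc = V; Vccz = (V == 0); }
  // Models the buggy path: data lands in vcc without updating vccz.
  void smemLandsInVcc(uint64_t V) { Vcc = V; }
};

static void vcczExample() {
  StatusModel S;
  S.smemLandsInVcc(1); // vccz is now stale (still reads as "vcc == 0")
  assert(S.Vccz && S.Vcc != 0);
  S.writeVcc(S.Vcc);   // the s_mov_b64 vcc, vcc workaround
  assert(!S.Vccz);     // status bit restored
}
#endif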
1772 :
1773 : ++Iter;
1774 : }
1775 :
1776 : // Check if we need to force convergence at loop footer.
1777 0 : MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1778 0 : if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
1779 0 : LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1780 : WaitcntData->print();
1781 : LLVM_DEBUG(dbgs() << '\n';);
1782 :
1783 : // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1784 : // placement, but doesn't guarantee convergence for a loop. Each
1785 : // loop should take at most (n+1) iterations for it to converge naturally,
1786 : // where n is the number of bottom blocks. If this threshold is reached and
1787 : // the result hasn't converged, then we force convergence by inserting
1788 : // an s_waitcnt at the end of the loop footer.
1789 0 : if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
1790 : // To ensure convergence, the wait events at the loop footer must be no
1791 : // more than those from the previous iteration.
1792 : // As a simplification, instead of tracking individual scores and
1793 : // generating the precise wait count, just wait on 0.
1794 : bool HasPending = false;
1795 0 : MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1796 0 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1797 0 : T = (enum InstCounterType)(T + 1)) {
1798 0 : if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1799 : ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1800 : HasPending = true;
1801 : break;
1802 : }
1803 : }
1804 :
1805 : if (HasPending) {
1806 0 : if (!SWaitInst) {
1807 0 : SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1808 0 : DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1809 : .addImm(0);
1810 0 : TrackedWaitcntSet.insert(SWaitInst);
1811 : #if 0 // TODO: Format the debug output
1812 : OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1813 : OutputTransformAdd(SWaitInst, context);
1814 : #endif
1815 : }
1816 : #if 0 // TODO: ??
1817 : _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1818 : #endif
1819 : }
1820 :
1821 0 : if (SWaitInst) {
1822 : LLVM_DEBUG({
1823 : SWaitInst->print(dbgs());
1824 : dbgs() << "\nAdjusted score board:";
1825 : ScoreBrackets->dump();
1826 : });
1827 :
1828 : // Add this waitcnt to the block. It is either newly created or was
1829 : // created in a previous iteration and is added back, since block
1830 : // traversal always removes pass-generated waitcnts.
1831 0 : insertWaitcntBeforeCF(Block, SWaitInst);
1832 0 : WaitcntData->setWaitcnt(SWaitInst);
1833 : }
1834 : }
1835 : }
1836 0 : }
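// [Editorial example] The convergence guard in insertWaitcntInBlock in
// miniature: once a loop has been re-walked more than NumBottoms + 1 times
// without converging, any still-pending counter is pinned (LB = UB), which
// is the score-bracket effect of the "s_waitcnt 0" the pass then inserts.
// Plain structs stand in for BlockWaitcntBrackets; this is a sketch, not
// the pass's data structure.
#if 0
struct ToyBracket {
  int LB = 0, UB = 0;
  bool pending() const { return UB > LB; }
};

static bool shouldForceConvergence(unsigned IterCnt, unsigned NumBottoms) {
  return IterCnt > NumBottoms + 1; // same threshold as the pass
}

// "s_waitcnt 0" waits on every counter, so pinning all brackets models its
// effect; the pass itself only needs to find one pending counter to decide
// that the wait is required.
static bool applyZeroWait(ToyBracket (&B)[3] /* VM, LGKM, EXP */) {
  bool HadPending = false;
  for (ToyBracket &X : B) {
    if (X.pending())
      HadPending = true;
    X.LB = X.UB;
  }
  return HadPending; // emit "s_waitcnt 0" only if something was pending
}
#endif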
1837 :
1838 19750 : bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1839 19750 : ST = &MF.getSubtarget<GCNSubtarget>();
1840 19750 : TII = ST->getInstrInfo();
1841 19750 : TRI = &TII->getRegisterInfo();
1842 19750 : MRI = &MF.getRegInfo();
1843 19750 : MLI = &getAnalysis<MachineLoopInfo>();
1844 39500 : IV = AMDGPU::getIsaVersion(ST->getCPU());
1845 19750 : const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1846 :
1847 19750 : ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1848 79000 : for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1849 59250 : T = (enum InstCounterType)(T + 1))
1850 59250 : ForceEmitWaitcnt[T] = false;
1851 :
1852 19750 : HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1853 19750 : HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1854 19750 : HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1855 :
1856 19750 : HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1857 19750 : HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1858 : assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1859 : assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1860 :
1861 19750 : RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1862 19750 : RegisterEncoding.VGPRL =
1863 19750 : RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1864 19750 : RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1865 19750 : RegisterEncoding.SGPRL =
1866 19750 : RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1867 :
1868 : TrackedWaitcntSet.clear();
1869 : BlockVisitedSet.clear();
1870 : VCCZBugHandledSet.clear();
1871 19750 : LoopWaitcntDataMap.clear();
1872 19750 : BlockWaitcntProcessedSet.clear();
1873 :
1874 : // Walk over the blocks in reverse post-order, inserting
1875 : // s_waitcnt where needed.
1876 : ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1877 : bool Modified = false;
1878 : for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1879 : I = RPOT.begin(),
1880 : E = RPOT.end(), J = RPOT.begin();
1881 42391 : I != E;) {
1882 22641 : MachineBasicBlock &MBB = **I;
1883 :
1884 22641 : BlockVisitedSet.insert(&MBB);
1885 :
1886 22641 : BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1887 22641 : if (!ScoreBrackets) {
1888 16138 : BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
1889 8069 : ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1890 : }
1891 22641 : ScoreBrackets->setPostOrder(MBB.getNumber());
1892 22641 : MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1893 23498 : if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1894 : LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1895 :
1896 : // If we are walking into the block from before the loop, then guarantee
1897 : // at least 1 re-walk over the loop to propagate the information, even if
1898 : // no S_WAITCNT instructions were generated.
1899 22641 : if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1900 502 : unsigned Count = countNumBottomBlocks(ContainingLoop);
1901 :
1902 : // If the loop has multiple back-edges, and so more than one "bottom"
1903 : // basic block, we have to guarantee a re-walk over every block.
1904 502 : if ((std::count(BlockWaitcntProcessedSet.begin(),
1905 502 : BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
1906 248 : BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1907 : LLVM_DEBUG(dbgs() << "set-revisit1: Block"
1908 : << ContainingLoop->getHeader()->getNumber() << '\n';);
1909 : }
1910 : }
1911 :
1912 : // Walk over the instructions.
1913 22641 : insertWaitcntInBlock(MF, MBB);
1914 :
1915 : // Record that waitcnts have been processed at least once for this block.
1916 22641 : BlockWaitcntProcessedSet.push_back(&MBB);
1917 :
1918 : // See if we want to revisit the loop. If a loop has multiple back-edges,
1919 : // we shouldn't revisit the same "bottom" basic block.
1920 23160 : if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1921 : std::count(BlockWaitcntProcessedSet.begin(),
1922 : BlockWaitcntProcessedSet.end(), &MBB) == 1) {
1923 496 : MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1924 : BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1925 248 : if (EntrySB && EntrySB->getRevisitLoop()) {
1926 : EntrySB->setRevisitLoop(false);
1927 : J = I;
1928 248 : int32_t PostOrder = EntrySB->getPostOrder();
1929 : // TODO: Avoid this loop. Find another way to set I.
1930 : for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1931 : X = RPOT.begin(),
1932 : Y = RPOT.end();
1933 613 : X != Y; ++X) {
1934 613 : MachineBasicBlock &MBBX = **X;
1935 613 : if (MBBX.getNumber() == PostOrder) {
1936 : I = X;
1937 : break;
1938 : }
1939 : }
1940 : LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1941 : WaitcntData->incIterCnt();
1942 : LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
1943 248 : continue;
1944 : } else {
1945 : LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1946 : // Loop converged; reset the iteration count. If this loop gets revisited,
1947 : // it must be from an outer loop; restarting the counter ensures we don't
1948 : // force convergence on such revisits.
1949 : WaitcntData->resetIterCnt();
1950 : }
1951 : }
1952 :
1953 : J = I;
1954 : ++I;
1955 : }
1956 :
1957 : SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1958 :
1959 : bool HaveScalarStores = false;
1960 :
1961 41941 : for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1962 : ++BI) {
1963 : MachineBasicBlock &MBB = *BI;
1964 :
1965 379508 : for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1966 : ++I) {
1967 357317 : if (!HaveScalarStores && TII->isScalarStore(*I))
1968 : HaveScalarStores = true;
1969 :
1970 714634 : if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1971 : I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1972 17989 : EndPgmBlocks.push_back(&MBB);
1973 : }
1974 : }
1975 :
1976 19750 : if (HaveScalarStores) {
1977 : // If scalar writes are used, the cache must be flushed or else the next
1978 : // wave to reuse the same scratch memory can be clobbered.
1979 : //
1980 : // Insert s_dcache_wb at wave termination points if there were any scalar
1981 : // stores, and only if the cache hasn't already been flushed. This could be
1982 : // improved by looking across blocks for flushes in postdominating blocks
1983 : // from the stores but an explicitly requested flush is probably very rare.
1984 30 : for (MachineBasicBlock *MBB : EndPgmBlocks) {
1985 : bool SeenDCacheWB = false;
1986 :
1987 148 : for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1988 : ++I) {
1989 264 : if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1990 : SeenDCacheWB = true;
1991 130 : else if (TII->isScalarStore(*I))
1992 : SeenDCacheWB = false;
1993 :
1994 : // FIXME: It would be better to insert this before a waitcnt if any.
1995 117 : if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1996 133 : I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1997 : !SeenDCacheWB) {
1998 : Modified = true;
1999 30 : BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2000 : }
2001 : }
2002 : }
2003 : }
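// [Editorial example] The flush-tracking scan above reduced to a toy
// instruction stream: a scalar store re-arms the need for a flush, an
// existing s_dcache_wb disarms it, and each program-end point gets a flush
// only if one is still needed. Opcode strings are illustrative stand-ins
// for the MachineInstr opcodes checked above.
#if 0
#include <string>
#include <vector>

static unsigned countInsertedFlushes(const std::vector<std::string> &MBB) {
  bool SeenDCacheWB = false;
  unsigned Inserted = 0;
  for (const std::string &Op : MBB) {
    if (Op == "s_dcache_wb")
      SeenDCacheWB = true;
    else if (Op.rfind("s_store", 0) == 0) // any scalar store
      SeenDCacheWB = false;
    if ((Op == "s_endpgm" || Op == "si_return_to_epilog") && !SeenDCacheWB)
      ++Inserted; // the pass builds an S_DCACHE_WB here
  }
  return Inserted;
}

static void flushExample() {
  // One flush inserted: the store after the explicit s_dcache_wb re-arms it.
  unsigned N = countInsertedFlushes(
      {"s_store_dword", "s_dcache_wb", "s_store_dword", "s_endpgm"});
  (void)N; // N == 1
}
#endif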
2004 :
2005 19750 : if (!MFI->isEntryFunction()) {
2006 : // Wait for any outstanding memory operations that the input registers may
2007 : // depend on. We can't track them, and it's better to do the wait after the
2008 : // costly call sequence.
2009 :
2010 : // TODO: Could insert earlier and schedule more liberally with operations
2011 : // that only use caller preserved registers.
2012 : MachineBasicBlock &EntryBB = MF.front();
2013 5319 : BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
2014 : .addImm(0);
2015 :
2016 : Modified = true;
2017 : }
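// [Editorial example] Why the ".addImm(0)" above is a full wait: in the
// s_waitcnt simm16 payload each counter occupies its own bit-field, so an
// all-zero immediate encodes vmcnt(0) expcnt(0) lgkmcnt(0). The layout
// sketched below is the pre-gfx9 one (vmcnt [3:0], expcnt [6:4],
// lgkmcnt [11:8]); the pass itself derives per-counter maxima from
// AMDGPU::getVmcntBitMask and friends rather than hard-coding fields.
#if 0
#include <cstdint>

static uint16_t encodeWaitcntSketch(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return uint16_t((Vm & 0xF) | ((Exp & 0x7) << 4) | ((Lgkm & 0xF) << 8));
}

static_assert((0xF | (0x7 << 4) | (0xF << 8)) == 0xF7F,
              "a fully-relaxed wait sets every field; 0 waits on everything");
#endif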
2018 :
2019 19750 : return Modified;
2020 : }