LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaitcnts.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 547 637 85.9 %
Date: 2018-07-13 00:08:38 Functions: 27 29 93.1 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/DenseMap.h"
      27             : #include "llvm/ADT/DenseSet.h"
      28             : #include "llvm/ADT/PostOrderIterator.h"
      29             : #include "llvm/ADT/STLExtras.h"
      30             : #include "llvm/ADT/SmallVector.h"
      31             : #include "llvm/CodeGen/MachineBasicBlock.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineFunctionPass.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineLoopInfo.h"
      37             : #include "llvm/CodeGen/MachineMemOperand.h"
      38             : #include "llvm/CodeGen/MachineOperand.h"
      39             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      40             : #include "llvm/IR/DebugLoc.h"
      41             : #include "llvm/Pass.h"
      42             : #include "llvm/Support/Debug.h"
      43             : #include "llvm/Support/DebugCounter.h"
      44             : #include "llvm/Support/ErrorHandling.h"
      45             : #include "llvm/Support/raw_ostream.h"
      46             : #include <algorithm>
      47             : #include <cassert>
      48             : #include <cstdint>
      49             : #include <cstring>
      50             : #include <memory>
      51             : #include <utility>
      52             : #include <vector>
      53             : 
using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

// Debug counters (see -debug-counter) that let a developer force extra
// s_waitcnt instructions to be emitted for one specific counter type,
// which helps bisect suspected missing-wait miscompiles.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line escape hatch: when nonzero, every waitcnt the pass emits is
// the maximally conservative "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)".
// Hidden from -help; intended for triage only.
static cl::opt<unsigned> ForceEmitZeroFlag(
  "amdgpu-waitcnt-forcezero",
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
  cl::init(0), cl::Hidden);
namespace {

// Class of object that encapsulates latest instruction counter score
// associated with the operand.  Used for determining whether
// s_waitcnt instruction needs to be emitted.

// Bitmask with only the bit for counter type t set.
#define CNT_MASK(t) (1u << (t))

// The three hardware wait counters an s_waitcnt instruction can wait on.
// NUM_INST_CNTS doubles as an array-size / iteration sentinel.
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

// Register slot range [first, second) in scoreboard numbering; {-1, -1}
// is used as the "no interval" sentinel (see getRegInterval).
using RegInterval = std::pair<signed, signed>;

// Per-subtarget maximum wait-counter values and register counts.
// NOTE(review): populated elsewhere in this pass (not visible in this
// chunk) — presumably once per function from the subtarget.
struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

// Hardware encoding values of the first/last VGPR and SGPR, used to map a
// register's encoding to a scoreboard slot index.
// NOTE(review): populated elsewhere in this pass (not visible here).
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

// Every kind of memory/export event that can increment a wait counter.
// Multiple event types can map to the same counter (see eventCounter).
enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

// Iterate w over every WaitEventType value, in declaration order.
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))
     129             : 
// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  // Score brackets start out empty; only the VGPR score tables need an
  // explicit memset because they have no in-class initializer.
  BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  // Maximum hardware value of counter T (from the per-subtarget limits);
  // 0 for anything that is not one of the three real counters.
  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  // Set the lower bound of the score bracket for counter T.
  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS) // belt-and-braces guard for release builds
      return;
    ScoreLBs[T] = Val;
  }

  // Set the upper bound of the score bracket for counter T.  For EXP_CNT
  // the lower bound is also clamped so the bracket never spans more than
  // the hardware expcnt capacity.
  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  // Lower bound of the outstanding-event score bracket for counter T.
  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  // Upper bound of the outstanding-event score bracket for counter T.
  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    // Not reachable: every case above returns or asserts.
    return NUM_INST_CNTS;
  }

  // Record score Val for scoreboard slot GprNo under counter T, and keep
  // VgprUB/SgprUB (highest slot ever touched) up to date so block merges
  // only need to scan the used prefix of the tables.
  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      // Slots at NUM_ALL_VGPRS and above are SGPRs; only lgkmcnt events
      // can target SGPRs, hence the single SgprScores table.
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  // Latest recorded score for scoreboard slot GprNo under counter T
  // (T is ignored for SGPR slots — see setRegScore).
  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  // Reset all brackets, event bounds and per-register scores to zero.
  // Note: VgprUB/SgprUB and the boolean state flags are NOT reset here.
  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  // Map operand OpNo of MI to its scoreboard slot interval; returns
  // {-1, -1} for operands that need no tracking (see definition below).
  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  // Record EXP_CNT score Val on every VGPR covered by operand OpNo of MI.
  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  // Highest score recorded so far for events of type W.
  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  // True if some SMEM access is still inside the lgkmcnt bracket.
  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  // True if the last flat memory op is still inside either the lgkmcnt
  // or the vmcnt bracket (flat ops increment both counters).
  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  // Remember the current upper bounds as the score of the latest flat op.
  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const SISubtarget *ST = nullptr;
  // If true, a wait must be emitted at the start of the block.
  bool WaitAtBeginning = false;
  // Set when a loop containing this block must be walked again.
  bool RevisitLoop = false;
  // True when several export event types overlap, forcing expcnt(0) waits.
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  // The s_waitcnt associated with this block, if any.
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
     336             : 
     337             : // This is a per-loop-region object that records waitcnt status at the end of
     338             : // loop footer from the previous iteration. We also maintain an iteration
     339             : // count to track the number of times the loop has been visited. When it
     340             : // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
     341             : // at the end of the loop footer.
     342             : class LoopWaitcntData {
     343             : public:
     344             :   LoopWaitcntData() = default;
     345             :   ~LoopWaitcntData() = default;
     346             : 
     347         228 :   void incIterCnt() { IterCnt++; }
     348           0 :   void resetIterCnt() { IterCnt = 0; }
     349         487 :   unsigned getIterCnt() { return IterCnt; }
     350             : 
     351           0 :   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
     352             :   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
     353             : 
     354             :   void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
     355             : 
     356             : private:
     357             :   // s_waitcnt added at the end of loop footer to stablize wait scores
     358             :   // at the end of the loop footer.
     359             :   MachineInstr *LfWaitcnt = nullptr;
     360             :   // Number of iterations the loop has been visited, not including the initial
     361             :   // walk over.
     362             :   int32_t IterCnt = 0;
     363             : };
     364             : 
// Machine-function pass that walks each function and inserts the s_waitcnt
// instructions required before uses of asynchronously-produced results.
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  // Cached per-function target/analysis handles, set up in
  // runOnMachineFunction (definition not visible in this chunk).
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  // Blocks already visited during the traversal.
  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  // Waitcnt instructions this pass itself created/updated.
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  // Instructions already patched for the VCCZ hardware bug.
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  // Per-block score brackets (owned here).
  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  // Per-loop convergence bookkeeping (owned here).
  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // Snapshots of brackets taken at kill points; see addKillWaitBracket.
  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
  // because of amdgpu-waitcnt-forcezero flag
  bool ForceEmitZeroWaitcnts;
  // Per-counter force flags driven by the debug counters above.
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    // Reference the debug counters so they are not considered unused in
    // release builds (they are only read under #ifndef NDEBUG).
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  // True if any of the per-counter force flags is set.
  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  // Refresh ForceEmitWaitcnt[] from the three debug counters.
  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
         DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  // Worker routines; definitions are outside this class (some not visible
  // in this chunk).
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                  BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};
     469             : 
     470             : } // end anonymous namespace
     471             : 
// Map operand OpNo of MI to the scoreboard slot interval [first, second)
// it occupies (VGPRs in [0, NUM_ALL_VGPRS), SGPRs offset by NUM_ALL_VGPRS).
// Returns {-1, -1} when the operand needs no score tracking: not a register,
// not allocatable, or (when Def is set) not a definition.
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  // Hardware encoding of the register, used as the slot base.
  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    // SGPR slots live after all VGPR slots in the scoreboard.
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  // The interval spans one slot per 32 bits of the operand's register class.
  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
     514             : 
     515       12015 : void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
     516             :                                        const SIInstrInfo *TII,
     517             :                                        const SIRegisterInfo *TRI,
     518             :                                        const MachineRegisterInfo *MRI,
     519             :                                        unsigned OpNo, int32_t Val) {
     520       12015 :   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
     521             :   LLVM_DEBUG({
     522             :     const MachineOperand &Opnd = MI->getOperand(OpNo);
     523             :     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
     524             :   });
     525       32039 :   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     526             :     setRegScore(RegNo, EXP_CNT, Val);
     527             :   }
     528       12015 : }
     529             : 
// Record that instruction \p Inst generated wait event \p E: advance the
// score bracket of the counter associated with E, then stamp the new score
// onto the registers the instruction touches so that later uses/defs of
// those registers know which score they must wait on.
void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  // Each new event occupies the next slot at the top of the bracket.
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      // DS_APPEND/DS_CONSUME have no address operand, so they are excluded.
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        // Score the store's data operand(s); data1 exists only on the
        // two-operand (e.g. write2-style) forms.
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // No-return atomic (other than the GWS/append special cases): score
        // every vgpr source operand.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        // For MIMG/MTBUF/MUBUF stores operand 0 is the data source.
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      // Score all vgpr source operands of the remaining EXP_CNT events.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
    ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      // Intervals at or past NUM_ALL_VGPRS denote sgprs/extra slots, which
      // vm_cnt does not track.
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    // LDS state is tracked in a dedicated extra slot past the vgprs.
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
     673             : 
     674             : void BlockWaitcntBrackets::print(raw_ostream &OS) {
     675             :   OS << '\n';
     676             :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     677             :        T = (enum InstCounterType)(T + 1)) {
     678             :     int LB = getScoreLB(T);
     679             :     int UB = getScoreUB(T);
     680             : 
     681             :     switch (T) {
     682             :     case VM_CNT:
     683             :       OS << "    VM_CNT(" << UB - LB << "): ";
     684             :       break;
     685             :     case LGKM_CNT:
     686             :       OS << "    LGKM_CNT(" << UB - LB << "): ";
     687             :       break;
     688             :     case EXP_CNT:
     689             :       OS << "    EXP_CNT(" << UB - LB << "): ";
     690             :       break;
     691             :     default:
     692             :       OS << "    UNKNOWN(" << UB - LB << "): ";
     693             :       break;
     694             :     }
     695             : 
     696             :     if (LB < UB) {
     697             :       // Print vgpr scores.
     698             :       for (int J = 0; J <= getMaxVGPR(); J++) {
     699             :         int RegScore = getRegScore(J, T);
     700             :         if (RegScore <= LB)
     701             :           continue;
     702             :         int RelScore = RegScore - LB - 1;
     703             :         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
     704             :           OS << RelScore << ":v" << J << " ";
     705             :         } else {
     706             :           OS << RelScore << ":ds ";
     707             :         }
     708             :       }
     709             :       // Also need to print sgpr scores for lgkm_cnt.
     710             :       if (T == LGKM_CNT) {
     711             :         for (int J = 0; J <= getMaxSGPR(); J++) {
     712             :           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
     713             :           if (RegScore <= LB)
     714             :             continue;
     715             :           int RelScore = RegScore - LB - 1;
     716             :           OS << RelScore << ":s" << J << " ";
     717             :         }
     718             :       }
     719             :     }
     720             :     OS << '\n';
     721             :   }
     722             :   OS << '\n';
     723             : }
     724             : 
     725     2639939 : unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
     726             :                                                 int ScoreToWait) {
     727             :   unsigned int NeedWait = 0;
     728     2639939 :   if (ScoreToWait == -1) {
     729             :     // The score to wait is unknown. This implies that it was not encountered
     730             :     // during the path of the CFG walk done during the current traversal but
     731             :     // may be seen on a different path. Emit an s_wait counter with a
     732             :     // conservative value of 0 for the counter.
     733           0 :     NeedWait = CNT_MASK(T);
     734             :     setScoreLB(T, getScoreUB(T));
     735             :     return NeedWait;
     736             :   }
     737             : 
     738             :   // If the score of src_operand falls within the bracket, we need an
     739             :   // s_waitcnt instruction.
     740             :   const int32_t LB = getScoreLB(T);
     741             :   const int32_t UB = getScoreUB(T);
     742     2639939 :   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     743             :     if ((T == VM_CNT || T == LGKM_CNT) &&
     744       39792 :         hasPendingFlat() &&
     745        1252 :         !ST->hasFlatLgkmVMemCountInOrder()) {
     746             :       // If there is a pending FLAT operation, and this is a VMem or LGKM
     747             :       // waitcnt and the target can report early completion, then we need
     748             :       // to force a waitcnt 0.
     749        1252 :       NeedWait = CNT_MASK(T);
     750             :       setScoreLB(T, getScoreUB(T));
     751       37288 :     } else if (counterOutOfOrder(T)) {
     752             :       // Counter can get decremented out-of-order when there
     753             :       // are multiple types event in the bracket. Also emit an s_wait counter
     754             :       // with a conservative value of 0 for the counter.
     755       15772 :       NeedWait = CNT_MASK(T);
     756             :       setScoreLB(T, getScoreUB(T));
     757             :     } else {
     758       21516 :       NeedWait = CNT_MASK(T);
     759             :       setScoreLB(T, ScoreToWait);
     760             :     }
     761             :   }
     762             : 
     763             :   return NeedWait;
     764             : }
     765             : 
     766             : // Where there are multiple types of event in the bracket of a counter,
     767             : // the decrement may go out of order.
     768       47968 : bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
     769       47968 :   switch (T) {
     770             :   case VM_CNT:
     771             :     return false;
     772       20470 :   case LGKM_CNT: {
     773       36242 :     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     774       15772 :         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     775             :       // Scalar memory read always can go out of order.
     776             :       return true;
     777             :     }
     778             :     int NumEventTypes = 0;
     779        9396 :     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     780        4698 :         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     781             :       NumEventTypes++;
     782             :     }
     783        4698 :     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     784           0 :         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     785           0 :       NumEventTypes++;
     786             :     }
     787        4698 :     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
     788           0 :         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
     789           0 :       NumEventTypes++;
     790             :     }
     791        4698 :     if (NumEventTypes <= 1) {
     792             :       return false;
     793             :     }
     794             :     break;
     795             :   }
     796       13694 :   case EXP_CNT: {
     797             :     // If there has been a mixture of export types, then a waitcnt exp(0) is
     798             :     // required.
     799       13694 :     if (MixedExpTypes)
     800             :       return true;
     801             :     int NumEventTypes = 0;
     802       13991 :     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     803         297 :         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     804             :       NumEventTypes++;
     805             :     }
     806       13694 :     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     807           0 :         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     808           0 :       NumEventTypes++;
     809             :     }
     810       26930 :     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     811       13236 :         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     812       13236 :       NumEventTypes++;
     813             :     }
     814       13822 :     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
     815         128 :         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
     816         128 :       NumEventTypes++;
     817             :     }
     818             : 
     819       13735 :     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
     820          41 :         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
     821          41 :       NumEventTypes++;
     822             :     }
     823             : 
     824       13694 :     if (NumEventTypes <= 1) {
     825             :       return false;
     826             :     }
     827             :     break;
     828             :   }
     829             :   default:
     830             :     break;
     831             :   }
     832           8 :   return true;
     833             : }
     834             : 
// Register the pass with the LLVM pass infrastructure under DEBUG_TYPE with
// the human-readable name "SI Insert Waitcnts".
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Unique pass identity; the address of ID (not its value) identifies the pass.
char SIInsertWaitcnts::ID = 0;

// Exported handle so other code can reference this pass by ID.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
     843             : 
     844        1795 : FunctionPass *llvm::createSIInsertWaitcntsPass() {
     845        1795 :   return new SIInsertWaitcnts();
     846             : }
     847             : 
     848             : static bool readsVCCZ(const MachineInstr &MI) {
     849      602089 :   unsigned Opc = MI.getOpcode();
     850      602699 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     851         610 :          !MI.getOperand(1).isUndef();
     852             : }
     853             : 
     854             : /// Given wait count encodings checks if LHS is stronger than RHS.
     855        1120 : bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
     856        1120 :   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
     857             :     return false;
     858        1120 :   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
     859             :     return false;
     860        1119 :   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
     861             :     return false;
     862        1119 :   return true;
     863             : }
     864             : 
     865             : /// Given wait count encodings create a new encoding which is stronger
     866             : /// or equal to both.
     867           0 : unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
     868           0 :   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
     869           0 :                             AMDGPU::decodeVmcnt(IV, RHS));
     870           0 :   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
     871           0 :                               AMDGPU::decodeLgkmcnt(IV, RHS));
     872           0 :   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
     873           0 :                              AMDGPU::decodeExpcnt(IV, RHS));
     874           0 :   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
     875             : }
     876             : 
     877             : ///  Generate s_waitcnt instruction to be placed before cur_Inst.
     878             : ///  Instructions of a given type are returned in order,
     879             : ///  but instructions of different types can complete out of order.
     880             : ///  We rely on this in-order completion
     881             : ///  and simply assign a score to the memory access instructions.
     882             : ///  We keep track of the active "score bracket" to determine
     883             : ///  if an access of a memory read requires an s_waitcnt
     884             : ///  and if so what the value of each counter is.
     885             : ///  The "score bracket" is bound by the lower bound and upper bound
     886             : ///  scores (*_score_LB and *_score_ub respectively).
     887      301054 : void SIInsertWaitcnts::generateWaitcntInstBefore(
     888             :     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
     889             :   // To emit, or not to emit - that's the question!
     890             :   // Start with an assumption that there is no need to emit.
     891             :   unsigned int EmitWaitcnt = 0;
     892             : 
     893             :   // No need to wait before phi. If a phi-move exists, then the wait should
     894             :   // has been inserted before the move. If a phi-move does not exist, then
     895             :   // wait should be inserted before the real use. The same is true for
     896             :   // sc-merge. It is not a coincident that all these cases correspond to the
     897             :   // instructions that are skipped in the assembling loop.
     898             :   bool NeedLineMapping = false; // TODO: Check on this.
     899             : 
     900             :   // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
     901             :   bool ForceEmitZeroWaitcnt = false;
     902             : 
     903             :   setForceEmitWaitcnt();
     904             :   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
     905             : 
     906             :   if (MI.isDebugInstr() &&
     907             :       // TODO: any other opcode?
     908             :       !NeedLineMapping) {
     909             :     return;
     910             :   }
     911             : 
     912             :   // See if an s_waitcnt is forced at block entry, or is needed at
     913             :   // program end.
     914      301035 :   if (ScoreBrackets->getWaitAtBeginning()) {
     915             :     // Note that we have already cleared the state, so we don't need to update
     916             :     // it.
     917             :     ScoreBrackets->clearWaitAtBeginning();
     918           0 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     919           0 :          T = (enum InstCounterType)(T + 1)) {
     920           0 :       EmitWaitcnt |= CNT_MASK(T);
     921             :       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     922             :     }
     923             :   }
     924             : 
     925             :   // See if this instruction has a forced S_WAITCNT VM.
     926             :   // TODO: Handle other cases of NeedsWaitcntVmBefore()
     927      300919 :   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
     928      601953 :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
     929             :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
     930             :     EmitWaitcnt |=
     931        1135 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     932             :   }
     933             : 
     934             :   // All waits must be resolved at call return.
     935             :   // NOTE: this could be improved with knowledge of all call sites or
     936             :   //   with knowledge of the called routines.
     937      602070 :   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
     938             :       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     939       15197 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     940        6513 :          T = (enum InstCounterType)(T + 1)) {
     941        6513 :       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
     942             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     943        1617 :         EmitWaitcnt |= CNT_MASK(T);
     944             :       }
     945             :     }
     946             :   }
     947             :   // Resolve vm waits before gs-done.
     948      298850 :   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
     949      298890 :             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
     950          26 :            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
     951             :             AMDGPU::SendMsg::ID_GS_DONE)) {
     952           6 :     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
     953             :       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     954           0 :       EmitWaitcnt |= CNT_MASK(VM_CNT);
     955             :     }
     956             :   }
     957             : #if 0 // TODO: the following blocks of logic when we have fence.
     958             :   else if (MI.getOpcode() == SC_FENCE) {
     959             :     const unsigned int group_size =
     960             :       context->shader_info->GetMaxThreadGroupSize();
     961             :     // group_size == 0 means thread group size is unknown at compile time
     962             :     const bool group_is_multi_wave =
     963             :       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
     964             :     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
     965             : 
     966             :     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
     967             :       SCRegType src_type = Inst->GetSrcType(i);
     968             :       switch (src_type) {
     969             :         case SCMEM_LDS:
     970             :           if (group_is_multi_wave ||
     971             :             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
     972             :             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     973             :                                ScoreBrackets->getScoreUB(LGKM_CNT));
     974             :             // LDS may have to wait for VM_CNT after buffer load to LDS
     975             :             if (target_info->HasBufferLoadToLDS()) {
     976             :               EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     977             :                                  ScoreBrackets->getScoreUB(VM_CNT));
     978             :             }
     979             :           }
     980             :           break;
     981             : 
     982             :         case SCMEM_GDS:
     983             :           if (group_is_multi_wave || fence_is_global) {
     984             :             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     985             :               ScoreBrackets->getScoreUB(EXP_CNT));
     986             :             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     987             :               ScoreBrackets->getScoreUB(LGKM_CNT));
     988             :           }
     989             :           break;
     990             : 
     991             :         case SCMEM_UAV:
     992             :         case SCMEM_TFBUF:
     993             :         case SCMEM_RING:
     994             :         case SCMEM_SCATTER:
     995             :           if (group_is_multi_wave || fence_is_global) {
     996             :             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     997             :               ScoreBrackets->getScoreUB(EXP_CNT));
     998             :             EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     999             :               ScoreBrackets->getScoreUB(VM_CNT));
    1000             :           }
    1001             :           break;
    1002             : 
    1003             :         case SCMEM_SCRATCH:
    1004             :         default:
    1005             :           break;
    1006             :       }
    1007             :     }
    1008             :   }
    1009             : #endif
    1010             : 
    1011             :   // Export & GDS instructions do not read the EXEC mask until after the export
    1012             :   // is granted (which can occur well after the instruction is issued).
    1013             :   // The shader program must flush all EXP operations on the export-count
    1014             :   // before overwriting the EXEC mask.
    1015             :   else {
    1016      597716 :     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
    1017             :       // Export and GDS are tracked individually, either may trigger a waitcnt
    1018             :       // for EXEC.
    1019        2326 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1020             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
    1021        2326 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1022             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
    1023        2326 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1024             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
    1025        2326 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1026             :           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    1027             :     }
    1028             : 
    1029             : #if 0 // TODO: the following code to handle CALL.
    1030             :     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    1031             :     // However, there is a problem with EXP_CNT, because the call cannot
    1032             :     // easily tell if a register is used in the function, and if it did, then
    1033             :     // the referring instruction would have to have an S_WAITCNT, which is
    1034             :     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    1035             :     // before the call.
    1036             :     if (MI.getOpcode() == SC_CALL) {
    1037             :       if (ScoreBrackets->getScoreUB(EXP_CNT) >
    1038             :         ScoreBrackets->getScoreLB(EXP_CNT)) {
    1039             :         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1040             :         EmitWaitcnt |= CNT_MASK(EXP_CNT);
    1041             :       }
    1042             :     }
    1043             : #endif
    1044             : 
    1045             :     // FIXME: Should not be relying on memoperands.
    1046             :     // Look at the source operands of every instruction to see if
    1047             :     // any of them results from a previous memory operation that affects
    1048             :     // its current usage. If so, an s_waitcnt instruction needs to be
    1049             :     // emitted.
    1050             :     // If the source operand was defined by a load, add the s_waitcnt
    1051             :     // instruction.
    1052      467880 :     for (const MachineMemOperand *Memop : MI.memoperands()) {
    1053             :       unsigned AS = Memop->getAddrSpace();
    1054       84511 :       if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1055       73135 :         continue;
    1056             :       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1057             :       // VM_CNT is only relevant to vgpr or LDS.
    1058       11376 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1059             :           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1060             :     }
    1061             : 
    1062     1607411 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1063     1308553 :       const MachineOperand &Op = MI.getOperand(I);
    1064     1308553 :       const MachineRegisterInfo &MRIA = *MRI;
    1065             :       RegInterval Interval =
    1066     1308553 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
    1067     2751018 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1068     1442465 :         if (TRI->isVGPR(MRIA, Op.getReg())) {
    1069             :           // VM_CNT is only relevant to vgpr or LDS.
    1070      413833 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1071             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1072             :         }
    1073     1442465 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1074             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1075             :       }
    1076             :     }
    1077             :     // End of for loop that looks at all source operands to decide vm_wait_cnt
    1078             :     // and lgk_wait_cnt.
    1079             : 
    1080             :     // Two cases are handled for destination operands:
    1081             :     // 1) If the destination operand was defined by a load, add the s_waitcnt
    1082             :     // instruction to guarantee the right WAW order.
    1083             :     // 2) If a destination operand that was used by a recent export/store ins,
    1084             :     // add s_waitcnt on exp_cnt to guarantee the WAR order.
    1085      298858 :     if (MI.mayStore()) {
    1086             :       // FIXME: Should not be relying on memoperands.
    1087      110894 :       for (const MachineMemOperand *Memop : MI.memoperands()) {
    1088             :         unsigned AS = Memop->getAddrSpace();
    1089       37038 :         if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1090       30148 :           continue;
    1091             :         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1092        6890 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1093             :             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1094        6890 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1095             :             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1096             :       }
    1097             :     }
    1098     1607411 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1099     1308553 :       MachineOperand &Def = MI.getOperand(I);
    1100     1308553 :       const MachineRegisterInfo &MRIA = *MRI;
    1101             :       RegInterval Interval =
    1102     1308553 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
    1103     1688328 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1104      379775 :         if (TRI->isVGPR(MRIA, Def.getReg())) {
    1105      184038 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1106             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1107      184038 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1108             :               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1109             :         }
    1110      379775 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1111             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1112             :       }
    1113             :     } // End of for loop that looks at all dest operands.
    1114             :   }
    1115             : 
    1116             :   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
    1117             :   // occurs before the instruction. Doing it here prevents any additional
    1118             :   // S_WAITCNTs from being emitted if the instruction was marked as
    1119             :   // requiring a WAITCNT beforehand.
    1120      602137 :   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
    1121          67 :       !ST->hasAutoWaitcntBeforeBarrier()) {
    1122          65 :     EmitWaitcnt |=
    1123          65 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1124          65 :     EmitWaitcnt |= ScoreBrackets->updateByWait(
    1125             :         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1126          65 :     EmitWaitcnt |= ScoreBrackets->updateByWait(
    1127             :         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
    1128             :   }
    1129             : 
    1130             :   // TODO: Remove this work-around, enable the assert for Bug 457939
    1131             :   //       after fixing the scheduler. Also, the Shader Compiler code is
    1132             :   //       independent of target.
    1133         304 :   if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    1134             :     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1135         201 :             ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1136          25 :         ScoreBrackets->hasPendingSMEM()) {
    1137             :       // Wait on everything, not just LGKM.  vccz reads usually come from
    1138             :       // terminators, and we always wait on everything at the end of the
    1139             :       // block, so if we only wait on LGKM here, we might end up with
    1140             :       // another s_waitcnt inserted right after this if there are non-LGKM
    1141             :       // instructions still outstanding.
    1142             :       // FIXME: this is too conservative / the comment is wrong.
    1143             :       // We don't wait on everything at the end of the block and we combine
    1144             :       // waitcnts so we should never have back-to-back waitcnts.
    1145             :       ForceEmitZeroWaitcnt = true;
    1146             :       EmitWaitcnt = true;
    1147             :     }
    1148             :   }
    1149             : 
    1150             :   // Does this operand processing indicate s_wait counter update?
    1151      301035 :   if (EmitWaitcnt || IsForceEmitWaitcnt) {
    1152             :     int CntVal[NUM_INST_CNTS];
    1153             : 
    1154             :     bool UseDefaultWaitcntStrategy = true;
    1155       34906 :     if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
    1156             :       // Force all waitcnts to 0.
    1157          28 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1158          12 :            T = (enum InstCounterType)(T + 1)) {
    1159             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1160             :       }
    1161           4 :       CntVal[VM_CNT] = 0;
    1162           4 :       CntVal[EXP_CNT] = 0;
    1163           4 :       CntVal[LGKM_CNT] = 0;
    1164             :       UseDefaultWaitcntStrategy = false;
    1165             :     }
    1166             : 
    1167             :     if (UseDefaultWaitcntStrategy) {
    1168      244314 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1169      104706 :            T = (enum InstCounterType)(T + 1)) {
    1170      104706 :         if (EmitWaitcnt & CNT_MASK(T)) {
    1171             :           int Delta =
    1172       37514 :               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
    1173             :           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
    1174       37514 :           if (Delta >= MaxDelta) {
    1175             :             Delta = -1;
    1176          10 :             if (T != EXP_CNT) {
    1177          10 :               ScoreBrackets->setScoreLB(
    1178             :                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
    1179             :             }
    1180          10 :             EmitWaitcnt &= ~CNT_MASK(T);
    1181             :           }
    1182       37514 :           CntVal[T] = Delta;
    1183             :         } else {
    1184             :           // If we are not waiting for a particular counter then encode
    1185             :           // it as -1 which means "don't care."
    1186       67192 :           CntVal[T] = -1;
    1187             :         }
    1188             :       }
    1189             :     }
    1190             : 
    1191             :     // If we are not waiting on any counter we can skip the wait altogether.
    1192       34906 :     if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
    1193       34898 :       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
    1194       34898 :       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
    1195        1340 :       if (!OldWaitcnt ||
    1196        1340 :           (AMDGPU::decodeVmcnt(IV, Imm) !=
    1197        2674 :                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
    1198        1334 :           (AMDGPU::decodeExpcnt(IV, Imm) !=
    1199       37548 :            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
    1200        1316 :           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
    1201        1316 :            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
    1202       33582 :         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
    1203         236 :         if (ContainingLoop) {
    1204         236 :           MachineBasicBlock *TBB = ContainingLoop->getHeader();
    1205             :           BlockWaitcntBrackets *ScoreBracket =
    1206         236 :               BlockWaitcntBracketsMap[TBB].get();
    1207         236 :           if (!ScoreBracket) {
    1208             :             assert(!BlockVisitedSet.count(TBB));
    1209             :             BlockWaitcntBracketsMap[TBB] =
    1210           0 :                 llvm::make_unique<BlockWaitcntBrackets>(ST);
    1211             :             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    1212             :           }
    1213             :           ScoreBracket->setRevisitLoop(true);
    1214             :           LLVM_DEBUG(dbgs()
    1215             :                          << "set-revisit2: Block"
    1216             :                          << ContainingLoop->getHeader()->getNumber() << '\n';);
    1217             :         }
    1218             :       }
    1219             : 
    1220             :       // Update an existing waitcount, or make a new one.
    1221      209388 :       unsigned Enc = AMDGPU::encodeWaitcnt(IV,
    1222       69796 :                       ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
    1223       69796 :                       ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
    1224      104694 :                       ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
    1225             :       // We don't remove waitcnts that existed prior to the waitcnt
    1226             :       // pass. Check if the waitcnt to-be-inserted can be avoided
    1227             :       // or if the prev waitcnt can be updated.
    1228             :       bool insertSWaitInst = true;
    1229       34898 :       for (MachineBasicBlock::iterator I = MI.getIterator(),
    1230       34898 :                                        B = MI.getParent()->begin();
    1231       69549 :            insertSWaitInst && I != B; --I) {
    1232       67206 :         if (I == MI.getIterator())
    1233             :           continue;
    1234             : 
    1235       65110 :         switch (I->getOpcode()) {
    1236             :         case AMDGPU::S_WAITCNT:
    1237        1120 :           if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
    1238             :             insertSWaitInst = false;
    1239           1 :           else if (!OldWaitcnt) {
    1240             :             OldWaitcnt = &*I;
    1241           0 :             Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
    1242             :           }
    1243             :           break;
    1244             :         // TODO: skip over instructions which never require wait.
    1245             :         }
    1246             :         break;
    1247             :       }
    1248       34898 :       if (insertSWaitInst) {
    1249       34000 :         if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
    1250             :           if (ForceEmitZeroWaitcnts)
    1251             :             LLVM_DEBUG(
    1252             :                 dbgs()
    1253             :                 << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
    1254             :           if (IsForceEmitWaitcnt)
    1255             :             LLVM_DEBUG(dbgs()
    1256             :                        << "Force emit a s_waitcnt due to debug counter\n");
    1257             : 
    1258         221 :           OldWaitcnt->getOperand(0).setImm(Enc);
    1259         221 :           if (!OldWaitcnt->getParent())
    1260         220 :             MI.getParent()->insert(MI, OldWaitcnt);
    1261             : 
    1262             :           LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
    1263             :                             << "Old Instr: " << MI << '\n'
    1264             :                             << "New Instr: " << *OldWaitcnt << '\n');
    1265             :         } else {
    1266       33558 :             auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
    1267       33558 :                                MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1268       33558 :                              .addImm(Enc);
    1269       67116 :             TrackedWaitcntSet.insert(SWaitInst);
    1270             : 
    1271             :             LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
    1272             :                               << "Old Instr: " << MI << '\n'
    1273             :                               << "New Instr: " << *SWaitInst << '\n');
    1274             :         }
    1275             :       }
    1276             : 
    1277       34898 :       if (CntVal[EXP_CNT] == 0) {
    1278             :         ScoreBrackets->setMixedExpTypes(false);
    1279             :       }
    1280             :     }
    1281             :   }
    1282             : }
    1283             : 
    1284           0 : void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
    1285             :                                              MachineInstr *Waitcnt) {
    1286           0 :   if (MBB.empty()) {
    1287             :     MBB.push_back(Waitcnt);
    1288           0 :     return;
    1289             :   }
    1290             : 
    1291           0 :   MachineBasicBlock::iterator It = MBB.end();
    1292             :   MachineInstr *MI = &*(--It);
    1293           0 :   if (MI->isBranch()) {
    1294             :     MBB.insert(It, Waitcnt);
    1295             :   } else {
    1296             :     MBB.push_back(Waitcnt);
    1297             :   }
    1298             : }
    1299             : 
    1300             : // This is a flat memory operation. Check to see if it has memory
    1301             : // tokens for both LDS and Memory, and if so mark it as a flat.
    1302        8955 : bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
    1303        8955 :   if (MI.memoperands_empty())
    1304             :     return true;
    1305             : 
    1306       23609 :   for (const MachineMemOperand *Memop : MI.memoperands()) {
    1307             :     unsigned AS = Memop->getAddrSpace();
    1308        8931 :     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
    1309             :       return true;
    1310             :   }
    1311             : 
    1312             :   return false;
    1313             : }
    1314             : 
/// Record in \p ScoreBrackets the wait-count events generated by \p Inst.
/// Classifies the instruction (DS, FLAT, VMEM, SMRD, message, export, ...)
/// and bumps the upper bound of each counter bracket it affects.
void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      // GDS accesses score against both the data-share access event and the
      // GDS register-lock event.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      // Plain LDS access.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    // Flat accesses can count against VM_CNT and/or LGKM_CNT depending on
    // the instruction's counter usage flags.
    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    // On subtargets where VMEM writes need an EXP wait, stores and
    // no-return atomics additionally score the VGPR-lock event.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    // Scalar memory reads are tracked via the SMEM access event.
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    // Non-memory-class opcodes that still generate countable events.
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      // Classify the export by its target index: 32-63 score as parameter
      // exports, 12-15 as position exports, anything else as a general
      // export GPR lock.
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      // Timing reads are counted like scalar memory accesses.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
    1381             : 
    1382             : // Merge the score brackets of the Block's predecessors;
    1383             : // this merged score bracket is used when adding waitcnts to the Block
    1384       20480 : void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
    1385       40960 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1386       20480 :   int32_t MaxPending[NUM_INST_CNTS] = {0};
    1387       20480 :   int32_t MaxFlat[NUM_INST_CNTS] = {0};
    1388             :   bool MixedExpTypes = false;
    1389             : 
    1390             :   // For single basic block loops, we need to retain the Block's
    1391             :   // score bracket to have accurate Pred info. So, make a copy of Block's
    1392             :   // score bracket, clear() it (which retains several important bits of info),
    1393             :   // populate, and then replace en masse. For non-single basic block loops,
    1394             :   // just clear Block's current score bracket and repopulate in-place.
    1395             :   bool IsSelfPred;
    1396             :   std::unique_ptr<BlockWaitcntBrackets> S;
    1397             : 
    1398       61440 :   IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    1399             :     != Block.pred_end();
    1400       20480 :   if (IsSelfPred) {
    1401         608 :     S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    1402             :     ScoreBrackets = S.get();
    1403             :   }
    1404             : 
    1405       20480 :   ScoreBrackets->clear();
    1406             : 
    1407             :   // See if there are any uninitialized predecessors. If so, emit an
    1408             :   // s_waitcnt 0 at the beginning of the block.
    1409       24404 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1410             :     BlockWaitcntBrackets *PredScoreBrackets =
    1411             :         BlockWaitcntBracketsMap[Pred].get();
    1412        3924 :     bool Visited = BlockVisitedSet.count(Pred);
    1413        3828 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1414          96 :       continue;
    1415             :     }
    1416       26796 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1417       11484 :          T = (enum InstCounterType)(T + 1)) {
    1418             :       int span =
    1419       11484 :           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
    1420       22968 :       MaxPending[T] = std::max(MaxPending[T], span);
    1421       11484 :       span =
    1422       11484 :           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
    1423       22968 :       MaxFlat[T] = std::max(MaxFlat[T], span);
    1424             :     }
    1425             : 
    1426        3828 :     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
    1427             :   }
    1428             : 
    1429             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1430             :   // Also handle kills for exit block.
    1431       38414 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1432        2451 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1433        5313 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1434        2277 :            T = (enum InstCounterType)(T + 1)) {
    1435        2277 :         int Span = KillWaitBrackets[I]->getScoreUB(T) -
    1436        2277 :                    KillWaitBrackets[I]->getScoreLB(T);
    1437        4554 :         MaxPending[T] = std::max(MaxPending[T], Span);
    1438        2277 :         Span = KillWaitBrackets[I]->pendingFlat(T) -
    1439             :                KillWaitBrackets[I]->getScoreLB(T);
    1440        4554 :         MaxFlat[T] = std::max(MaxFlat[T], Span);
    1441             :       }
    1442             : 
    1443         759 :       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    1444             :     }
    1445             :   }
    1446             : 
    1447             :   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
    1448       24404 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1449             :     BlockWaitcntBrackets *PredScoreBrackets =
    1450             :         BlockWaitcntBracketsMap[Pred].get();
    1451        3924 :     bool Visited = BlockVisitedSet.count(Pred);
    1452        3924 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1453          96 :       continue;
    1454             :     }
    1455             : 
    1456        3828 :     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
    1457        3828 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1458        3828 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1459        3828 :     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
    1460        3828 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1461        3828 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1462             :   }
    1463             : 
    1464             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1465       38414 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1466        2451 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1467         759 :       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
    1468         759 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1469         759 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1470         759 :       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
    1471         759 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1472         759 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1473             :     }
    1474             :   }
    1475             : 
    1476             : #if 0
    1477             :   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
    1478             :   // TODO: how does LC distinguish between function entry and main entry?
    1479             :   // If this is the entry to a function, force a wait.
    1480             :   MachineBasicBlock &Entry = Block.getParent()->front();
    1481             :   if (Entry.getNumber() == Block.getNumber()) {
    1482             :     ScoreBrackets->setWaitAtBeginning();
    1483             :     return;
    1484             :   }
    1485             : #endif
    1486             : 
    1487             :   // Now set the current Block's brackets to the largest ending bracket.
    1488      143360 :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1489       61440 :        T = (enum InstCounterType)(T + 1)) {
    1490       61440 :     ScoreBrackets->setScoreUB(T, MaxPending[T]);
    1491             :     ScoreBrackets->setScoreLB(T, 0);
    1492       61440 :     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
    1493             :   }
    1494             : 
    1495             :   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
    1496             : 
    1497             :   // Set the register scoreboard.
    1498       24404 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1499          96 :     if (!BlockVisitedSet.count(Pred)) {
    1500          96 :       continue;
    1501             :     }
    1502             : 
    1503             :     BlockWaitcntBrackets *PredScoreBrackets =
    1504             :         BlockWaitcntBracketsMap[Pred].get();
    1505             : 
    1506             :     // Now merge the gpr_reg_score information
    1507       26796 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1508       11484 :          T = (enum InstCounterType)(T + 1)) {
    1509             :       int PredLB = PredScoreBrackets->getScoreLB(T);
    1510             :       int PredUB = PredScoreBrackets->getScoreUB(T);
    1511       11484 :       if (PredLB < PredUB) {
    1512        2147 :         int PredScale = MaxPending[T] - PredUB;
    1513             :         // Merge vgpr scores.
    1514      143699 :         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
    1515             :           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
    1516       70776 :           if (PredRegScore <= PredLB)
    1517       65129 :             continue;
    1518        5647 :           int NewRegScore = PredScale + PredRegScore;
    1519        5647 :           ScoreBrackets->setRegScore(
    1520       11294 :               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1521             :         }
    1522             :         // Also need to merge sgpr scores for lgkm_cnt.
    1523        2147 :         if (T == LGKM_CNT) {
    1524        8574 :           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
    1525             :             int PredRegScore =
    1526        3905 :                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1527        3905 :             if (PredRegScore <= PredLB)
    1528        2485 :               continue;
    1529        1420 :             int NewRegScore = PredScale + PredRegScore;
    1530        1420 :             ScoreBrackets->setRegScore(
    1531             :                 J + NUM_ALL_VGPRS, LGKM_CNT,
    1532             :                 std::max(
    1533        2840 :                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1534             :                     NewRegScore));
    1535             :           }
    1536             :         }
    1537             :       }
    1538             :     }
    1539             : 
    1540             :     // Also merge the WaitEvent information.
    1541       80388 :     ForAllWaitEventType(W) {
    1542             :       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
    1543             :       int PredEventUB = PredScoreBrackets->getEventUB(W);
    1544       38280 :       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
    1545             :         int NewEventUB =
    1546        4300 :             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
    1547        2150 :         if (NewEventUB > 0) {
    1548        2150 :           ScoreBrackets->setEventUB(
    1549        4300 :               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1550             :         }
    1551             :       }
    1552             :     }
    1553             :   }
    1554             : 
    1555             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1556             :   // Set the register scoreboard.
    1557       38414 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1558        2451 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1559             :       // Now merge the gpr_reg_score information.
    1560        5313 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1561        2277 :            T = (enum InstCounterType)(T + 1)) {
    1562        2277 :         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
    1563             :         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
    1564        2277 :         if (PredLB < PredUB) {
    1565          28 :           int PredScale = MaxPending[T] - PredUB;
    1566             :           // Merge vgpr scores.
    1567         230 :           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
    1568             :             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
    1569          58 :             if (PredRegScore <= PredLB)
    1570          50 :               continue;
    1571           8 :             int NewRegScore = PredScale + PredRegScore;
    1572           8 :             ScoreBrackets->setRegScore(
    1573          16 :                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1574             :           }
    1575             :           // Also need to merge sgpr scores for lgkm_cnt.
    1576          28 :           if (T == LGKM_CNT) {
    1577         360 :             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
    1578             :               int PredRegScore =
    1579         104 :                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1580         104 :               if (PredRegScore <= PredLB)
    1581          56 :                 continue;
    1582          48 :               int NewRegScore = PredScale + PredRegScore;
    1583          48 :               ScoreBrackets->setRegScore(
    1584             :                   J + NUM_ALL_VGPRS, LGKM_CNT,
    1585             :                   std::max(
    1586          96 :                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1587             :                       NewRegScore));
    1588             :             }
    1589             :           }
    1590             :         }
    1591             :       }
    1592             : 
    1593             :       // Also merge the WaitEvent information.
    1594       15939 :       ForAllWaitEventType(W) {
    1595        7590 :         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
    1596             :         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
    1597        7590 :         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
    1598             :           int NewEventUB =
    1599          56 :               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
    1600          28 :           if (NewEventUB > 0) {
    1601          28 :             ScoreBrackets->setEventUB(
    1602          56 :                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1603             :           }
    1604             :         }
    1605             :       }
    1606             :     }
    1607             :   }
    1608             : 
    1609             :   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
    1610             :   // sequencing predecessors, because changes to EXEC require waitcnts due to
    1611             :   // the delayed nature of these operations.
    1612       24404 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1613          96 :     if (!BlockVisitedSet.count(Pred)) {
    1614          96 :       continue;
    1615             :     }
    1616             : 
    1617             :     BlockWaitcntBrackets *PredScoreBrackets =
    1618             :         BlockWaitcntBracketsMap[Pred].get();
    1619             : 
    1620             :     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    1621        3828 :     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1622           0 :       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
    1623           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1624           0 :       if (new_gds_ub > 0) {
    1625           0 :         ScoreBrackets->setEventUB(
    1626             :             GDS_GPR_LOCK,
    1627           0 :             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
    1628             :       }
    1629             :     }
    1630             :     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    1631        3828 :     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1632           4 :       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
    1633           2 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1634           2 :       if (new_exp_ub > 0) {
    1635           2 :         ScoreBrackets->setEventUB(
    1636             :             EXP_GPR_LOCK,
    1637           4 :             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
    1638             :       }
    1639             :     }
    1640             :   }
    1641             : 
    1642             :   // if a single block loop, update the score brackets. Not needed for other
    1643             :   // blocks, as we did this in-place
    1644       20480 :   if (IsSelfPred) {
    1645         912 :     BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    1646             :   }
    1647       20480 : }
    1648             : 
    1649             : /// Return true if the given basic block is a "bottom" block of a loop.
    1650             : /// This works even if the loop is discontiguous. This also handles
    1651             : /// multiple back-edges for the same "header" block of a loop.
    1652        1626 : bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
    1653             :                                     const MachineBasicBlock *Block) {
    1654        9190 :   for (MachineBasicBlock *MBB : Loop->blocks()) {
    1655        6382 :     if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
    1656             :       return true;
    1657             :     }
    1658             :   }
    1659             :   return false;
    1660             : }
    1661             : 
    1662             : /// Count the number of "bottom" basic blocks of a loop.
    1663         945 : unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
    1664             :   unsigned Count = 0;
    1665        4695 :   for (MachineBasicBlock *MBB : Loop->blocks()) {
    1666        1875 :     if (MBB->isSuccessor(Loop->getHeader())) {
    1667        1061 :       Count++;
    1668             :     }
    1669             :   }
    1670         945 :   return Count;
    1671             : }
    1672             : 
    1673             : // Generate s_waitcnt instructions where needed.
// Generate s_waitcnt instructions where needed in a single basic block:
// merge the score brackets of all visited predecessors, then walk the
// block's instructions inserting/removing S_WAITCNTs, and finally force
// convergence at a loop "bottom" block if the iterative algorithm has not
// converged on its own.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information by merging predecessor brackets into
  // this block's entry state.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions. The iterator is advanced manually because
  // some iterations erase the current instruction (see below).
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
      // as needed.
      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        // Advance the iterator *before* unlinking Inst so it stays valid.
        ++Iter;
        Inst.removeFromParent();
      }
      ScoreBrackets->setWaitcnt(&Inst);
      continue;
    }

    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

    // Hardware bug: on SI/CI (<= SEA_ISLANDS) the vccz bit can be stale when
    // an SMEM operation that writes VCC is still outstanding (pending
    // lgkm_cnt). Detect that case here; the fix-up is emitted after the
    // waitcnt for this instruction has been generated.
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets);

    // Record the wait events this instruction itself produces.
    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
      ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // Model that implicit full wait by zeroing all pending counters.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // a s_waitcnt at the end of loop footer.
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      // Scan counters for any pending events; the first pending counter is
      // enough to decide (hence the break), and its LB is clamped to UB to
      // reflect the full wait that will be inserted.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          // Create an S_WAITCNT 0 (wait on everything) and remember it as
          // pass-generated so a later traversal may remove/re-add it.
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                              .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}
    1838             : 
// Pass entry point: cache subtarget info and hardware limits, traverse the
// CFG in reverse post-order (revisiting loops until the waitcnt state
// converges), then handle two function-wide fix-ups: s_dcache_wb before
// program-end when scalar stores were seen, and an S_WAITCNT 0 at the entry
// of non-entry (callable) functions.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;

  // Per-ISA-version maximum encodable counter values.
  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Cache the hardware encoding ranges [VGPR0..VGPRL] / [SGPR0..SGPRL] used
  // to index the register scoreboard.
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  // Reset all per-function state.
  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post-order, inserting s_waitcnt where
  // needed. (The previous comment said "reverse post-dominator order", but
  // ReversePostOrderTraversal is a plain RPO over the CFG.)
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    // Lazily create the score brackets for a block on first visit.
    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      // (BlockWaitcntProcessedSet is a vector, so std::count gives the number
      // of completed visits of this header so far.)
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        // Rewind I back to the loop header (identified by its recorded
        // post-order number) so the loop body is traversed again.
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, reset iteration count. If this loop gets revisited,
        // it must be from an outer loop, the counter will restart, this will
        // ensure we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  // Collect all blocks that end the program, and note whether any scalar
  // store was seen anywhere in the function.
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        // A store after a flush invalidates the flush, so reset the flag.
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

    Modified = true;
  }

  return Modified;
}

Generated by: LCOV version 1.13