LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaitcnts.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 547 636 86.0 %
Date: 2018-06-17 00:07:59 Functions: 27 29 93.1 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/DenseMap.h"
      27             : #include "llvm/ADT/DenseSet.h"
      28             : #include "llvm/ADT/PostOrderIterator.h"
      29             : #include "llvm/ADT/STLExtras.h"
      30             : #include "llvm/ADT/SmallVector.h"
      31             : #include "llvm/CodeGen/MachineBasicBlock.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineFunctionPass.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineLoopInfo.h"
      37             : #include "llvm/CodeGen/MachineMemOperand.h"
      38             : #include "llvm/CodeGen/MachineOperand.h"
      39             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      40             : #include "llvm/IR/DebugLoc.h"
      41             : #include "llvm/Pass.h"
      42             : #include "llvm/Support/Debug.h"
      43             : #include "llvm/Support/DebugCounter.h"
      44             : #include "llvm/Support/ErrorHandling.h"
      45             : #include "llvm/Support/raw_ostream.h"
      46             : #include <algorithm>
      47             : #include <cassert>
      48             : #include <cstdint>
      49             : #include <cstring>
      50             : #include <memory>
      51             : #include <utility>
      52             : #include <vector>
      53             : 
      54             : using namespace llvm;
      55             : 
      56             : #define DEBUG_TYPE "si-insert-waitcnts"
      57             : 
      58      101169 : DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
      59             :               "Force emit s_waitcnt expcnt(0) instrs");
      60      101169 : DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
      61             :               "Force emit s_waitcnt lgkmcnt(0) instrs");
      62      101169 : DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
      63             :               "Force emit s_waitcnt vmcnt(0) instrs");
      64             : 
      65      101169 : static cl::opt<unsigned> ForceEmitZeroFlag(
      66             :   "amdgpu-waitcnt-forcezero",
      67      101169 :   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
      68      303507 :   cl::init(0), cl::Hidden);
      69             : 
      70             : namespace {
      71             : 
      72             : // Class of object that encapsulates latest instruction counter score
      73             : // associated with the operand.  Used for determining whether
      74             : // s_waitcnt instruction needs to be emitted.
      75             : 
      76             : #define CNT_MASK(t) (1u << (t))
      77             : 
      78             : enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
      79             : 
      80             : using RegInterval = std::pair<signed, signed>;
      81             : 
      82             : struct {
      83             :   int32_t VmcntMax;
      84             :   int32_t ExpcntMax;
      85             :   int32_t LgkmcntMax;
      86             :   int32_t NumVGPRsMax;
      87             :   int32_t NumSGPRsMax;
      88             : } HardwareLimits;
      89             : 
      90             : struct {
      91             :   unsigned VGPR0;
      92             :   unsigned VGPRL;
      93             :   unsigned SGPR0;
      94             :   unsigned SGPRL;
      95             : } RegisterEncoding;
      96             : 
      97             : enum WaitEventType {
      98             :   VMEM_ACCESS,      // vector-memory read & write
      99             :   LDS_ACCESS,       // lds read & write
     100             :   GDS_ACCESS,       // gds read & write
     101             :   SQ_MESSAGE,       // send message
     102             :   SMEM_ACCESS,      // scalar-memory read & write
     103             :   EXP_GPR_LOCK,     // export holding on its data src
     104             :   GDS_GPR_LOCK,     // GDS holding on its data and addr src
     105             :   EXP_POS_ACCESS,   // write to export position
     106             :   EXP_PARAM_ACCESS, // write to export parameter
     107             :   VMW_GPR_LOCK,     // vector-memory write holding on its data src
     108             :   NUM_WAIT_EVENTS,
     109             : };
     110             : 
     111             : // The mapping is:
     112             : //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
     113             : //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
     114             : //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
     115             : // We reserve a fixed number of VGPR slots in the scoring tables for
     116             : // special tokens like SCMEM_LDS (needed for buffer load to LDS).
     117             : enum RegisterMapping {
     118             :   SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
     119             :   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
     120             :   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
     121             :   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
     122             :   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
     123             : };
     124             : 
     125             : #define ForAllWaitEventType(w)                                                 \
     126             :   for (enum WaitEventType w = (enum WaitEventType)0;                           \
     127             :        (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
     128             :        (w) = (enum WaitEventType)((w) + 1))
     129             : 
     130             : // This is a per-basic-block object that maintains current score brackets
     131             : // of each wait counter, and a per-register scoreboard for each wait counter.
     132             : // We also maintain the latest score for every event type that can change the
     133             : // waitcnt in order to know if there are multiple types of events within
     134             : // the brackets. When multiple types of event happen in the bracket,
     135             : // wait count may get decreased out of order, therefore we need to put in
     136             : // "s_waitcnt 0" before use.
     137             : class BlockWaitcntBrackets {
     138             : public:
     139        7741 :   BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
     140       54187 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     141       23223 :          T = (enum InstCounterType)(T + 1)) {
     142       23223 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     143             :     }
     144        7741 :   }
     145             : 
     146             :   ~BlockWaitcntBrackets() = default;
     147             : 
     148             :   static int32_t getWaitCountMax(InstCounterType T) {
     149       38575 :     switch (T) {
     150       14416 :     case VM_CNT:
     151       14416 :       return HardwareLimits.VmcntMax;
     152       21927 :     case LGKM_CNT:
     153       21927 :       return HardwareLimits.LgkmcntMax;
     154        2232 :     case EXP_CNT:
     155       13072 :       return HardwareLimits.ExpcntMax;
     156             :     default:
     157             :       break;
     158             :     }
     159             :     return 0;
     160             :   }
     161             : 
     162             :   void setScoreLB(InstCounterType T, int32_t Val) {
     163             :     assert(T < NUM_INST_CNTS);
     164       39988 :     if (T >= NUM_INST_CNTS)
     165             :       return;
     166      102599 :     ScoreLBs[T] = Val;
     167             :   }
     168             : 
     169             :   void setScoreUB(InstCounterType T, int32_t Val) {
     170             :     assert(T < NUM_INST_CNTS);
     171             :     if (T >= NUM_INST_CNTS)
     172             :       return;
     173      172174 :     ScoreUBs[T] = Val;
     174      110983 :     if (T == EXP_CNT) {
     175       10840 :       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
     176       10840 :       if (ScoreLBs[T] < UB)
     177        1087 :         ScoreLBs[T] = UB;
     178             :     }
     179             :   }
     180             : 
     181             :   int32_t getScoreLB(InstCounterType T) {
     182             :     assert(T < NUM_INST_CNTS);
     183     2727489 :     if (T >= NUM_INST_CNTS)
     184             :       return 0;
     185     2813427 :     return ScoreLBs[T];
     186             :   }
     187             : 
     188             :   int32_t getScoreUB(InstCounterType T) {
     189             :     assert(T < NUM_INST_CNTS);
     190     2785252 :     if (T >= NUM_INST_CNTS)
     191             :       return 0;
     192     2865951 :     return ScoreUBs[T];
     193             :   }
     194             : 
     195             :   // Mapping from event to counter.
     196             :   InstCounterType eventCounter(WaitEventType E) {
     197      164203 :     switch (E) {
     198             :     case VMEM_ACCESS:
     199             :       return VM_CNT;
     200       72515 :     case LDS_ACCESS:
     201             :     case GDS_ACCESS:
     202             :     case SQ_MESSAGE:
     203             :     case SMEM_ACCESS:
     204             :       return LGKM_CNT;
     205       37450 :     case EXP_GPR_LOCK:
     206             :     case GDS_GPR_LOCK:
     207             :     case VMW_GPR_LOCK:
     208             :     case EXP_POS_ACCESS:
     209             :     case EXP_PARAM_ACCESS:
     210             :       return EXP_CNT;
     211           0 :     default:
     212           0 :       llvm_unreachable("unhandled event type");
     213             :     }
     214             :     return NUM_INST_CNTS;
     215             :   }
     216             : 
     217             :   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
     218      147891 :     if (GprNo < NUM_ALL_VGPRS) {
     219       85169 :       if (GprNo > VgprUB) {
     220       16685 :         VgprUB = GprNo;
     221             :       }
     222       85169 :       VgprScores[T][GprNo] = Val;
     223             :     } else {
     224             :       assert(T == LGKM_CNT);
     225       67572 :       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
     226       16405 :         SgprUB = GprNo - NUM_ALL_VGPRS;
     227             :       }
     228       67572 :       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
     229             :     }
     230             :   }
     231             : 
     232             :   int32_t getRegScore(int GprNo, InstCounterType T) {
     233     2728307 :     if (GprNo < NUM_ALL_VGPRS) {
     234     1518090 :       return VgprScores[T][GprNo];
     235             :     }
     236     1235337 :     return SgprScores[GprNo - NUM_ALL_VGPRS];
     237             :   }
     238             : 
     239       20397 :   void clear() {
     240       20397 :     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     241       20397 :     memset(ScoreUBs, 0, sizeof(ScoreUBs));
     242       20397 :     memset(EventUBs, 0, sizeof(EventUBs));
     243      142779 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     244       61191 :          T = (enum InstCounterType)(T + 1)) {
     245       61191 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     246             :     }
     247       20397 :     memset(SgprScores, 0, sizeof(SgprScores));
     248       20397 :   }
     249             : 
     250             :   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
     251             :                              const MachineRegisterInfo *MRI,
     252             :                              const SIRegisterInfo *TRI, unsigned OpNo,
     253             :                              bool Def) const;
     254             : 
     255             :   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
     256             :                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
     257             :                    unsigned OpNo, int32_t Val);
     258             : 
     259             :   void setWaitAtBeginning() { WaitAtBeginning = true; }
     260           0 :   void clearWaitAtBeginning() { WaitAtBeginning = false; }
     261             :   bool getWaitAtBeginning() const { return WaitAtBeginning; }
     262        2448 :   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
     263             :   int32_t getMaxVGPR() const { return VgprUB; }
     264             :   int32_t getMaxSGPR() const { return SgprUB; }
     265             : 
     266             :   int32_t getEventUB(enum WaitEventType W) const {
     267             :     assert(W < NUM_WAIT_EVENTS);
     268       82706 :     return EventUBs[W];
     269             :   }
     270             : 
     271             :   bool counterOutOfOrder(InstCounterType T);
     272             :   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
     273             :   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
     274             :                      const MachineRegisterInfo *MRI, WaitEventType E,
     275             :                      MachineInstr &MI);
     276             : 
     277             :   bool hasPendingSMEM() const {
     278          59 :     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     279             :             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
     280             :   }
     281             : 
     282             :   bool hasPendingFlat() const {
     283       38101 :     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
     284       72488 :              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
     285       35639 :             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
     286          14 :              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
     287             :   }
     288             : 
     289             :   void setPendingFlat() {
     290        1616 :     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
     291        1616 :     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
     292             :   }
     293             : 
     294       15966 :   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
     295             : 
     296       61191 :   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
     297             : 
     298             :   bool getRevisitLoop() const { return RevisitLoop; }
     299         634 :   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
     300             : 
     301       20397 :   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
     302             :   int32_t getPostOrder() const { return PostOrder; }
     303             : 
     304        2516 :   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
     305      302803 :   void clearWaitcnt() { Waitcnt = nullptr; }
     306             :   MachineInstr *getWaitcnt() const { return Waitcnt; }
     307             : 
     308             :   bool mixedExpTypes() const { return MixedExpTypes; }
     309             :   void setMixedExpTypes(bool MixedExpTypesIn) {
     310       22493 :     MixedExpTypes = MixedExpTypesIn;
     311             :   }
     312             : 
     313             :   void print(raw_ostream &);
     314             :   void dump() { print(dbgs()); }
     315             : 
     316             : private:
     317             :   const SISubtarget *ST = nullptr;
     318             :   bool WaitAtBeginning = false;
     319             :   bool RevisitLoop = false;
     320             :   bool MixedExpTypes = false;
     321             :   int32_t PostOrder = 0;
     322             :   MachineInstr *Waitcnt = nullptr;
     323             :   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
     324             :   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
     325             :   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
     326             :   // Remember the last flat memory operation.
     327             :   int32_t LastFlat[NUM_INST_CNTS] = {0};
     328             :   // wait_cnt scores for every vgpr.
     329             :   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
     330             :   int32_t VgprUB = 0;
     331             :   int32_t SgprUB = 0;
     332             :   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
     333             :   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
     334             :   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
     335             : };
     336             : 
     337             : // This is a per-loop-region object that records waitcnt status at the end of
     338             : // loop footer from the previous iteration. We also maintain an iteration
     339             : // count to track the number of times the loop has been visited. When it
     340             : // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
     341             : // at the end of the loop footer.
     342             : class LoopWaitcntData {
     343             : public:
     344             :   LoopWaitcntData() = default;
     345             :   ~LoopWaitcntData() = default;
     346             : 
     347         205 :   void incIterCnt() { IterCnt++; }
     348           2 :   void resetIterCnt() { IterCnt = 0; }
     349         451 :   unsigned getIterCnt() { return IterCnt; }
     350             : 
     351           0 :   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
     352             :   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
     353             : 
     354             :   void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
     355             : 
     356             : private:
     357             :   // s_waitcnt added at the end of loop footer to stabilize wait scores
     358             :   // at the end of the loop footer.
     359             :   MachineInstr *LfWaitcnt = nullptr;
     360             :   // Number of iterations the loop has been visited, not including the initial
     361             :   // walk over.
     362             :   int32_t IterCnt = 0;
     363             : };
     364             : 
     365        7188 : class SIInsertWaitcnts : public MachineFunctionPass {
     366             : private:
     367             :   const SISubtarget *ST = nullptr;
     368             :   const SIInstrInfo *TII = nullptr;
     369             :   const SIRegisterInfo *TRI = nullptr;
     370             :   const MachineRegisterInfo *MRI = nullptr;
     371             :   const MachineLoopInfo *MLI = nullptr;
     372             :   AMDGPU::IsaInfo::IsaVersion IV;
     373             :   AMDGPUAS AMDGPUASI;
     374             : 
     375             :   DenseSet<MachineBasicBlock *> BlockVisitedSet;
     376             :   DenseSet<MachineInstr *> TrackedWaitcntSet;
     377             :   DenseSet<MachineInstr *> VCCZBugHandledSet;
     378             : 
     379             :   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
     380             :       BlockWaitcntBracketsMap;
     381             : 
     382             :   std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
     383             : 
     384             :   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
     385             : 
     386             :   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
     387             : 
     388             :   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
     389             :   // because of amdgpu-waitcnt-forcezero flag
     390             :   bool ForceEmitZeroWaitcnts;
     391             :   bool ForceEmitWaitcnt[NUM_INST_CNTS];
     392             : 
     393             : public:
     394             :   static char ID;
     395             : 
     396        7220 :   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
     397             : 
     398             :   bool runOnMachineFunction(MachineFunction &MF) override;
     399             : 
     400        1796 :   StringRef getPassName() const override {
     401        1796 :     return "SI insert wait instructions";
     402             :   }
     403             : 
     404        1796 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     405        1796 :     AU.setPreservesCFG();
     406             :     AU.addRequired<MachineLoopInfo>();
     407        1796 :     MachineFunctionPass::getAnalysisUsage(AU);
     408        1796 :   }
     409             : 
     410         199 :   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
     411             :     // The waitcnt information is copied because it changes as the block is
     412             :     // traversed.
     413         199 :     KillWaitBrackets.push_back(
     414         398 :         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
     415         199 :   }
     416             : 
     417             :   bool isForceEmitWaitcnt() const {
     418     1211212 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     419      908409 :          T = (enum InstCounterType)(T + 1))
     420      908409 :       if (ForceEmitWaitcnt[T])
     421             :         return true;
     422             :     return false;
     423             :   }
     424             : 
     425             :   void setForceEmitWaitcnt() {
     426             : // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
     427             : // For debug builds, get the debug counter info and adjust if need be
     428             : #ifndef NDEBUG
     429             :     if (DebugCounter::isCounterSet(ForceExpCounter) &&
     430             :         DebugCounter::shouldExecute(ForceExpCounter)) {
     431             :       ForceEmitWaitcnt[EXP_CNT] = true;
     432             :     } else {
     433             :       ForceEmitWaitcnt[EXP_CNT] = false;
     434             :     }
     435             : 
     436             :     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
     437             :          DebugCounter::shouldExecute(ForceLgkmCounter)) {
     438             :       ForceEmitWaitcnt[LGKM_CNT] = true;
     439             :     } else {
     440             :       ForceEmitWaitcnt[LGKM_CNT] = false;
     441             :     }
     442             : 
     443             :     if (DebugCounter::isCounterSet(ForceVMCounter) &&
     444             :         DebugCounter::shouldExecute(ForceVMCounter)) {
     445             :       ForceEmitWaitcnt[VM_CNT] = true;
     446             :     } else {
     447             :       ForceEmitWaitcnt[VM_CNT] = false;
     448             :     }
     449             : #endif // NDEBUG
     450             :   }
     451             : 
     452             :   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
     453             :   void generateWaitcntInstBefore(MachineInstr &MI,
     454             :                                   BlockWaitcntBrackets *ScoreBrackets);
     455             :   void updateEventWaitcntAfter(MachineInstr &Inst,
     456             :                                BlockWaitcntBrackets *ScoreBrackets);
     457             :   void mergeInputScoreBrackets(MachineBasicBlock &Block);
     458             :   bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
     459             :   unsigned countNumBottomBlocks(const MachineLoop *Loop);
     460             :   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
     461             :   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
     462             :   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
     463             :   unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
     464             : };
     465             : 
     466             : } // end anonymous namespace
     467             : 
     468     3333307 : RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
     469             :                                                  const SIInstrInfo *TII,
     470             :                                                  const MachineRegisterInfo *MRI,
     471             :                                                  const SIRegisterInfo *TRI,
     472             :                                                  unsigned OpNo,
     473             :                                                  bool Def) const {
     474     3333307 :   const MachineOperand &Op = MI->getOperand(OpNo);
     475     5471389 :   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
     476     1200169 :       (Def && !Op.isDef()))
     477     2106370 :     return {-1, -1};
     478             : 
     479             :   // A use via a PW operand does not need a waitcnt.
     480             :   // A partial write is not a WAW.
     481             :   assert(!Op.getSubReg() || !Op.isUndef());
     482             : 
     483             :   RegInterval Result;
     484             :   const MachineRegisterInfo &MRIA = *MRI;
     485             : 
     486     2453874 :   unsigned Reg = TRI->getEncodingValue(Op.getReg());
     487             : 
     488     1226937 :   if (TRI->isVGPR(MRIA, Op.getReg())) {
     489             :     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
     490      522247 :     Result.first = Reg - RegisterEncoding.VGPR0;
     491             :     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
     492      704690 :   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
     493             :     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
     494      704690 :     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
     495             :     assert(Result.first >= NUM_ALL_VGPRS &&
     496             :            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
     497             :   }
     498             :   // TODO: Handle TTMP
     499             :   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
     500             :   else
     501           0 :     return {-1, -1};
     502             : 
     503             :   const MachineInstr &MIA = *MI;
     504     1226937 :   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
     505             :   unsigned Size = TRI->getRegSizeInBits(*RC);
     506     1226937 :   Result.second = Result.first + (Size / 32);
     507             : 
     508     1226937 :   return Result;
     509             : }
     510             : 
     511       12199 : void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
     512             :                                        const SIInstrInfo *TII,
     513             :                                        const SIRegisterInfo *TRI,
     514             :                                        const MachineRegisterInfo *MRI,
     515             :                                        unsigned OpNo, int32_t Val) {
     516       12199 :   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
     517             :   LLVM_DEBUG({
     518             :     const MachineOperand &Opnd = MI->getOperand(OpNo);
     519             :     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
     520             :   });
     521       32778 :   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     522             :     setRegScore(RegNo, EXP_CNT, Val);
     523             :   }
     524       12199 : }
     525             : 
     526      110983 : void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     527             :                                          const SIRegisterInfo *TRI,
     528             :                                          const MachineRegisterInfo *MRI,
     529             :                                          WaitEventType E, MachineInstr &Inst) {
     530             :   const MachineRegisterInfo &MRIA = *MRI;
     531             :   InstCounterType T = eventCounter(E);
     532      110983 :   int32_t CurrScore = getScoreUB(T) + 1;
     533             :   // EventUB and ScoreUB need to be updated regardless of whether this event
     534             :   // changes the score of a register or not.
     535             :   // Examples include vm_cnt for buffer-store and lgkm_cnt for send-message.
     536      110983 :   EventUBs[E] = CurrScore;
     537             :   setScoreUB(T, CurrScore);
     538             : 
     539      110983 :   if (T == EXP_CNT) {
     540             :     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
     541             :     // is required.
     542       10840 :     if (!MixedExpTypes) {
     543       10840 :       MixedExpTypes = counterOutOfOrder(EXP_CNT);
     544             :     }
     545             : 
     546             :     // Put score on the source vgprs. If this is a store, just use those
     547             :     // specific register(s).
     548       10840 :     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
     549             :       // All GDS operations must protect their address register (same as
     550             :       // export.)
     551           0 :       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
     552             :           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
     553           0 :         setExpScore(
     554             :             &Inst, TII, TRI, MRI,
     555           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
     556             :             CurrScore);
     557             :       }
     558           0 :       if (Inst.mayStore()) {
     559           0 :         setExpScore(
     560             :             &Inst, TII, TRI, MRI,
     561           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
     562             :             CurrScore);
     563           0 :         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     564             :                                        AMDGPU::OpName::data1) != -1) {
     565           0 :           setExpScore(&Inst, TII, TRI, MRI,
     566             :                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     567             :                                                  AMDGPU::OpName::data1),
     568             :                       CurrScore);
     569             :         }
     570           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
     571           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
     572           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
     573           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
     574           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
     575           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
     576           0 :                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
     577           0 :                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
     578             :                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
     579           0 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     580           0 :           const MachineOperand &Op = Inst.getOperand(I);
     581           0 :           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
     582           0 :             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     583             :           }
     584             :         }
     585             :       }
     586       10840 :     } else if (TII->isFLAT(Inst)) {
     587           0 :       if (Inst.mayStore()) {
     588           0 :         setExpScore(
     589             :             &Inst, TII, TRI, MRI,
     590           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     591             :             CurrScore);
     592           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     593           0 :         setExpScore(
     594             :             &Inst, TII, TRI, MRI,
     595           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     596             :             CurrScore);
     597             :       }
     598       10840 :     } else if (TII->isMIMG(Inst)) {
     599          47 :       if (Inst.mayStore()) {
     600          47 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     601           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     602           0 :         setExpScore(
     603             :             &Inst, TII, TRI, MRI,
     604           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     605             :             CurrScore);
     606             :       }
     607       10793 :     } else if (TII->isMTBUF(Inst)) {
     608          24 :       if (Inst.mayStore()) {
     609          24 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     610             :       }
     611       10769 :     } else if (TII->isMUBUF(Inst)) {
     612       10316 :       if (Inst.mayStore()) {
     613       10316 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     614           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     615           0 :         setExpScore(
     616             :             &Inst, TII, TRI, MRI,
     617           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     618             :             CurrScore);
     619             :       }
     620             :     } else {
     621         453 :       if (TII->isEXP(Inst)) {
     622             :         // For export the destination registers are really temps that
     623             :         // can be used as the actual source after export patching, so
     624             :         // we need to treat them like sources and set the EXP_CNT
     625             :         // score.
     626        4530 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     627        4077 :           MachineOperand &DefMO = Inst.getOperand(I);
     628        6342 :           if (DefMO.isReg() && DefMO.isDef() &&
     629           0 :               TRI->isVGPR(MRIA, DefMO.getReg())) {
     630           0 :             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
     631             :                         CurrScore);
     632             :           }
     633             :         }
     634             :       }
     635        4530 :       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     636        4077 :         MachineOperand &MO = Inst.getOperand(I);
     637        6342 :         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
     638        1812 :           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     639             :         }
     640             :       }
     641             :     }
     642             : #if 0 // TODO: check if this is handled by MUBUF code above.
     643             :   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
     644             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
     645             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
     646             :     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
     647             :     unsigned OpNo;//TODO: find the OpNo for this operand;
     648             :     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
     649             :     for (signed RegNo = Interval.first; RegNo < Interval.second;
     650             :     ++RegNo) {
     651             :       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
     652             :     }
     653             : #endif
     654             :   } else {
     655             :     // Match the score to the destination registers.
     656      742943 :     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     657      642800 :       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
     658      642800 :       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
     659           0 :         continue;
     660      880724 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     661             :         setRegScore(RegNo, T, CurrScore);
     662             :       }
     663             :     }
     664      100143 :     if (TII->isDS(Inst) && Inst.mayStore()) {
     665             :       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
     666             :     }
     667             :   }
     668      110983 : }
     669             : 
     670             : void BlockWaitcntBrackets::print(raw_ostream &OS) { // Dumps, for each counter, the bracket width (UB - LB) followed by every register score that is still above the lower bound.
     671             :   OS << '\n';
     672             :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     673             :        T = (enum InstCounterType)(T + 1)) {
     674             :     int LB = getScoreLB(T);
     675             :     int UB = getScoreUB(T);
     676             : 
     677             :     switch (T) {
     678             :     case VM_CNT:
     679             :       OS << "    VM_CNT(" << UB - LB << "): ";
     680             :       break;
     681             :     case LGKM_CNT:
     682             :       OS << "    LGKM_CNT(" << UB - LB << "): ";
     683             :       break;
     684             :     case EXP_CNT:
     685             :       OS << "    EXP_CNT(" << UB - LB << "): ";
     686             :       break;
     687             :     default:
     688             :       OS << "    UNKNOWN(" << UB - LB << "): ";
     689             :       break;
     690             :     }
     691             : 
     692             :     if (LB < UB) { // Only a non-empty bracket has register scores worth listing.
     693             :       // Print vgpr scores.
     694             :       for (int J = 0; J <= getMaxVGPR(); J++) {
     695             :         int RegScore = getRegScore(J, T);
     696             :         if (RegScore <= LB) // Not above the lower bound: skip this register.
     697             :           continue;
     698             :         int RelScore = RegScore - LB - 1; // Score relative to the bracket; 0 is the first score above LB.
     699             :         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
     700             :           OS << RelScore << ":v" << J << " ";
     701             :         } else {
     702             :           OS << RelScore << ":ds "; // Slots at or past SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS are shown as "ds" rather than a VGPR number.
     703             :         }
     704             :       }
     705             :       // Also need to print sgpr scores for lgkm_cnt.
     706             :       if (T == LGKM_CNT) {
     707             :         for (int J = 0; J <= getMaxSGPR(); J++) {
     708             :           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); // SGPR scores are stored after the NUM_ALL_VGPRS VGPR slots.
     709             :           if (RegScore <= LB)
     710             :             continue;
     711             :           int RelScore = RegScore - LB - 1;
     712             :           OS << RelScore << ":s" << J << " ";
     713             :         }
     714             :       }
     715             :     }
     716             :     OS << '\n';
     717             :   }
     718             :   OS << '\n';
     719             : }
     720             : 
     721     2674269 : unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
     722             :                                                 int ScoreToWait) { // Returns CNT_MASK(T) when a wait on counter T is needed for ScoreToWait (0 otherwise) and raises T's lower bound to reflect that wait.
     723             :   unsigned int NeedWait = 0;
     724     2674269 :   if (ScoreToWait == -1) {
     725             :     // The score to wait is unknown. This implies that it was not encountered
     726             :     // during the path of the CFG walk done during the current traversal but
     727             :     // may be seen on a different path. Emit an s_waitcnt with a
     728             :     // conservative value of 0 for the counter.
     729           0 :     NeedWait = CNT_MASK(T);
     730             :     setScoreLB(T, getScoreUB(T)); // Lower bound catches up with the upper bound: nothing remains pending on T.
     731             :     return NeedWait;
     732             :   }
     733             : 
     734             :   // If the score of src_operand falls within the bracket, we need an
     735             :   // s_waitcnt instruction.
     736             :   const int32_t LB = getScoreLB(T);
     737             :   const int32_t UB = getScoreUB(T);
     738     2674269 :   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     739             :     if ((T == VM_CNT || T == LGKM_CNT) &&
     740       41240 :         hasPendingFlat() &&
     741        1252 :         !ST->hasFlatLgkmVMemCountInOrder()) {
     742             :       // If there is a pending FLAT operation, and this is a VMem or LGKM
     743             :       // waitcnt and the target can report early completion, then we need
     744             :       // to force a waitcnt 0.
     745        1252 :       NeedWait = CNT_MASK(T);
     746             :       setScoreLB(T, getScoreUB(T));
     747       38736 :     } else if (counterOutOfOrder(T)) {
     748             :       // The counter can get decremented out-of-order when there
     749             :       // are multiple types of event in the bracket. Also emit an s_waitcnt
     750             :       // with a conservative value of 0 for the counter.
     751       16870 :       NeedWait = CNT_MASK(T);
     752             :       setScoreLB(T, getScoreUB(T));
     753             :     } else {
     754       21866 :       NeedWait = CNT_MASK(T);
     755             :       setScoreLB(T, ScoreToWait); // In-order case: the lower bound only advances to the requested score.
     756             :     }
     757             :   }
     758             : 
     759             :   return NeedWait;
     760             : }
     761             : 
     762             : // When there are multiple types of event in the bracket of a counter,
     763             : // the decrement may go out of order. Returns true if counter T may do so.
     764       49576 : bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
     765       49576 :   switch (T) {
     766             :   case VM_CNT:
     767             :     return false; // VM_CNT is always treated as in-order here.
     768       21509 :   case LGKM_CNT: {
     769       38379 :     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     770       16870 :         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     771             :       // Scalar memory reads can always go out of order.
     772             :       return true;
     773             :     }
     774             :     int NumEventTypes = 0; // Number of distinct event types whose latest score lies inside the (LB, UB] bracket.
     775        9278 :     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     776        4639 :         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     777             :       NumEventTypes++;
     778             :     }
     779        4639 :     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     780           0 :         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     781           0 :       NumEventTypes++;
     782             :     }
     783        4639 :     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
     784           0 :         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
     785           0 :       NumEventTypes++;
     786             :     }
     787        4639 :     if (NumEventTypes <= 1) {
     788             :       return false;
     789             :     }
     790             :     break;
     791             :   }
     792       13965 :   case EXP_CNT: {
     793             :     // If there has been a mixture of export types, then a waitcnt exp(0) is
     794             :     // required.
     795       13965 :     if (MixedExpTypes)
     796             :       return true;
     797             :     int NumEventTypes = 0; // Same in-bracket event-type count as in the LGKM_CNT case, over the EXP_CNT events.
     798       14270 :     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     799         305 :         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     800             :       NumEventTypes++;
     801             :     }
     802       13965 :     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     803           0 :         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     804           0 :       NumEventTypes++;
     805             :     }
     806       27462 :     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     807       13497 :         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     808       13497 :       NumEventTypes++;
     809             :     }
     810       14095 :     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
     811         130 :         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
     812         130 :       NumEventTypes++;
     813             :     }
     814             : 
     815       14006 :     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
     816          41 :         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
     817          41 :       NumEventTypes++;
     818             :     }
     819             : 
     820       13965 :     if (NumEventTypes <= 1) {
     821             :       return false;
     822             :     }
     823             :     break;
     824             :   }
     825             :   default:
     826             :     break;
     827             :   }
     828           8 :   return true; // Reached when more than one event type is in the bracket, or when T is not one of the three cases above.
     829             : }
     830             : 
     831       76336 : INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     832             :                       false)
     833      357084 : INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     834             :                     false) // BEGIN/END pair registering the pass under DEBUG_TYPE with the name "SI Insert Waitcnts"; nothing is declared between them.
     835             : 
     836             : char SIInsertWaitcnts::ID = 0; // Static pass ID, exported through SIInsertWaitcntsID below.
     837             : 
     838             : char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; // Reference alias to the pass ID in namespace llvm.
     839             : 
     840        1795 : FunctionPass *llvm::createSIInsertWaitcntsPass() { // Factory: returns a newly allocated SIInsertWaitcnts pass as a raw FunctionPass pointer.
     841        1795 :   return new SIInsertWaitcnts();
     842             : }
     843             : 
     844             : static bool readsVCCZ(const MachineInstr &MI) { // True when MI is S_CBRANCH_VCCNZ or S_CBRANCH_VCCZ and its operand 1 is not undef.
     845      605589 :   unsigned Opc = MI.getOpcode();
     846      606165 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     847         576 :          !MI.getOperand(1).isUndef(); // Operand 1 is only inspected for the two branch opcodes (short-circuit &&).
     848             : }
     849             : 
     850             : /// Given wait count encodings, checks if LHS is at least as strong as RHS, i.e. no decoded counter (vmcnt, lgkmcnt, expcnt) of LHS is larger than in RHS.
     851        1124 : bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
     852        1124 :   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
     853             :     return false;
     854        1124 :   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
     855             :     return false;
     856        1119 :   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
     857             :     return false;
     858        1119 :   return true; // Equal encodings also reach here, so the relation is "stronger or equal".
     859             : }
     860             : 
     861             : /// Given wait count encodings, create a new encoding which is stronger than
     862             : /// or equal to both, by taking the per-counter minimum of the decoded values.
     863           0 : unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
     864           0 :   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
     865           0 :                             AMDGPU::decodeVmcnt(IV, RHS));
     866           0 :   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
     867           0 :                               AMDGPU::decodeLgkmcnt(IV, RHS));
     868           0 :   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
     869           0 :                              AMDGPU::decodeExpcnt(IV, RHS));
     870           0 :   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); // Note the argument order: Vm, Exp, Lgkm.
     871             : }
     872             : 
     873             : ///  Generate s_waitcnt instruction to be placed before cur_Inst.
     874             : ///  Instructions of a given type are returned in order,
     875             : ///  but instructions of different types can complete out of order.
     876             : ///  We rely on this in-order completion
     877             : ///  and simply assign a score to the memory access instructions.
     878             : ///  We keep track of the active "score bracket" to determine
     879             : ///  if an access of a memory read requires an s_waitcnt
     880             : ///  and if so what the value of each counter is.
     881             : ///  The "score bracket" is bound by the lower bound and upper bound
     882             : ///  scores (*_score_LB and *_score_ub respectively).
     883      302803 : void SIInsertWaitcnts::generateWaitcntInstBefore(
     884             :     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
     885             :   // To emit, or not to emit - that's the question!
     886             :   // Start with an assumption that there is no need to emit.
     887             :   unsigned int EmitWaitcnt = 0;
     888             : 
     889             :   // No need to wait before phi. If a phi-move exists, then the wait should
     890             :   // have been inserted before the move. If a phi-move does not exist, then
     891             :   // the wait should be inserted before the real use. The same is true for
     892             :   // sc-merge. It is not a coincidence that all these cases correspond to the
     893             :   // instructions that are skipped in the assembling loop.
     894             :   bool NeedLineMapping = false; // TODO: Check on this.
     895             : 
     896             :   // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
     897             :   bool ForceEmitZeroWaitcnt = false;
     898             : 
     899             :   setForceEmitWaitcnt();
     900             :   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
     901             : 
     902             :   if (MI.isDebugInstr() &&
     903             :       // TODO: any other opcode?
     904             :       !NeedLineMapping) {
     905             :     return;
     906             :   }
     907             : 
     908             :   // See if an s_waitcnt is forced at block entry, or is needed at
     909             :   // program end.
     910      302786 :   if (ScoreBrackets->getWaitAtBeginning()) {
     911             :     // Note that we have already cleared the state, so we don't need to update
     912             :     // it.
     913             :     ScoreBrackets->clearWaitAtBeginning();
     914           0 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     915           0 :          T = (enum InstCounterType)(T + 1)) {
     916           0 :       EmitWaitcnt |= CNT_MASK(T);
     917             :       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     918             :     }
     919             :   }
     920             : 
     921             :   // See if this instruction has a forced S_WAITCNT VM.
     922             :   // TODO: Handle other cases of NeedsWaitcntVmBefore()
     923      302670 :   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
     924      605455 :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
     925             :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
     926             :     EmitWaitcnt |=
     927        1135 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     928             :   }
     929             : 
     930             :   // All waits must be resolved at call return.
     931             :   // NOTE: this could be improved with knowledge of all call sites or
     932             :   //   with knowledge of the called routines.
     933      605572 :   if (MI.getOpcode() == AMDGPU::RETURN ||
     934      604966 :       MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
     935             :       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     936       13783 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     937        5907 :          T = (enum InstCounterType)(T + 1)) {
     938        5907 :       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
     939             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     940        1384 :         EmitWaitcnt |= CNT_MASK(T);
     941             :       }
     942             :     }
     943             :   }
     944             :   // Resolve vm waits before gs-done.
     945      300803 :   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
     946      300843 :             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
     947          26 :            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
     948             :             AMDGPU::SendMsg::ID_GS_DONE)) {
     949           6 :     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
     950             :       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     951           0 :       EmitWaitcnt |= CNT_MASK(VM_CNT);
     952             :     }
     953             :   }
     954             : #if 0 // TODO: the following blocks of logic when we have fence.
     955             :   else if (MI.getOpcode() == SC_FENCE) {
     956             :     const unsigned int group_size =
     957             :       context->shader_info->GetMaxThreadGroupSize();
     958             :     // group_size == 0 means thread group size is unknown at compile time
     959             :     const bool group_is_multi_wave =
     960             :       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
     961             :     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
     962             : 
     963             :     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
     964             :       SCRegType src_type = Inst->GetSrcType(i);
     965             :       switch (src_type) {
     966             :         case SCMEM_LDS:
     967             :           if (group_is_multi_wave ||
     968             :             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
     969             :             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     970             :                                ScoreBrackets->getScoreUB(LGKM_CNT));
     971             :             // LDS may have to wait for VM_CNT after buffer load to LDS
     972             :             if (target_info->HasBufferLoadToLDS()) {
     973             :               EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     974             :                                  ScoreBrackets->getScoreUB(VM_CNT));
     975             :             }
     976             :           }
     977             :           break;
     978             : 
     979             :         case SCMEM_GDS:
     980             :           if (group_is_multi_wave || fence_is_global) {
     981             :             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     982             :               ScoreBrackets->getScoreUB(EXP_CNT));
     983             :             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     984             :               ScoreBrackets->getScoreUB(LGKM_CNT));
     985             :           }
     986             :           break;
     987             : 
     988             :         case SCMEM_UAV:
     989             :         case SCMEM_TFBUF:
     990             :         case SCMEM_RING:
     991             :         case SCMEM_SCATTER:
     992             :           if (group_is_multi_wave || fence_is_global) {
     993             :             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     994             :               ScoreBrackets->getScoreUB(EXP_CNT));
     995             :             EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     996             :               ScoreBrackets->getScoreUB(VM_CNT));
     997             :           }
     998             :           break;
     999             : 
    1000             :         case SCMEM_SCRATCH:
    1001             :         default:
    1002             :           break;
    1003             :       }
    1004             :     }
    1005             :   }
    1006             : #endif
    1007             : 
    1008             :   // Export & GDS instructions do not read the EXEC mask until after the export
    1009             :   // is granted (which can occur well after the instruction is issued).
    1010             :   // The shader program must flush all EXP operations on the export-count
    1011             :   // before overwriting the EXEC mask.
    1012             :   else {
    1013      601622 :     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
    1014             :       // Export and GDS are tracked individually, either may trigger a waitcnt
    1015             :       // for EXEC.
    1016        2239 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1017             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
    1018        2239 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1019             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
    1020        2239 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1021             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
    1022        2239 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1023             :           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    1024             :     }
    1025             : 
    1026             : #if 0 // TODO: the following code to handle CALL.
    1027             :     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    1028             :     // However, there is a problem with EXP_CNT, because the call cannot
    1029             :     // easily tell if a register is used in the function, and if it did, then
    1030             :     // the referring instruction would have to have an S_WAITCNT, which is
    1031             :     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    1032             :     // before the call.
    1033             :     if (MI.getOpcode() == SC_CALL) {
    1034             :       if (ScoreBrackets->getScoreUB(EXP_CNT) >
    1035             :         ScoreBrackets->getScoreLB(EXP_CNT)) {
    1036             :         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1037             :         EmitWaitcnt |= CNT_MASK(EXP_CNT);
    1038             :       }
    1039             :     }
    1040             : #endif
    1041             : 
    1042             :     // FIXME: Should not be relying on memoperands.
    1043             :     // Look at the source operands of every instruction to see if
    1044             :     // any of them results from a previous memory operation that affects
    1045             :     // its current usage. If so, an s_waitcnt instruction needs to be
    1046             :     // emitted.
    1047             :     // If the source operand was defined by a load, add the s_waitcnt
    1048             :     // instruction.
    1049      488857 :     for (const MachineMemOperand *Memop : MI.memoperands()) {
    1050             :       unsigned AS = Memop->getAddrSpace();
    1051       94023 :       if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1052       82659 :         continue;
    1053             :       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1054             :       // VM_CNT is only relevant to vgpr or LDS.
    1055       11364 :       EmitWaitcnt |= ScoreBrackets->updateByWait(
    1056             :           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1057             :     }
    1058             : 
    1059     1639965 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1060     1339154 :       const MachineOperand &Op = MI.getOperand(I);
    1061     1339154 :       const MachineRegisterInfo &MRIA = *MRI;
    1062             :       RegInterval Interval =
    1063     1339154 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
    1064     2798346 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1065     1459192 :         if (TRI->isVGPR(MRIA, Op.getReg())) {
    1066             :           // VM_CNT is only relevant to vgpr or LDS.
    1067      422321 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1068             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1069             :         }
    1070     1459192 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1071             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1072             :       }
    1073             :     }
    1074             :     // End of for loop that looks at all source operands to decide vm_wait_cnt
    1075             :     // and lgk_wait_cnt.
    1076             : 
    1077             :     // Two cases are handled for destination operands:
    1078             :     // 1) If the destination operand was defined by a load, add the s_waitcnt
    1079             :     // instruction to guarantee the right WAW order.
    1080             :     // 2) If a destination operand that was used by a recent export/store ins,
    1081             :     // add s_waitcnt on exp_cnt to guarantee the WAR order.
    1082      300811 :     if (MI.mayStore()) {
    1083             :       // FIXME: Should not be relying on memoperands.
    1084      111137 :       for (const MachineMemOperand *Memop : MI.memoperands()) {
    1085             :         unsigned AS = Memop->getAddrSpace();
    1086       37116 :         if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1087       30238 :           continue;
    1088             :         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1089        6878 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1090             :             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1091        6878 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1092             :             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1093             :       }
    1094             :     }
    1095     1639965 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1096     1339154 :       MachineOperand &Def = MI.getOperand(I);
    1097     1339154 :       const MachineRegisterInfo &MRIA = *MRI;
    1098             :       RegInterval Interval =
    1099     1339154 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
    1100     1715612 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1101      376458 :         if (TRI->isVGPR(MRIA, Def.getReg())) {
    1102      190446 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1103             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1104      190446 :           EmitWaitcnt |= ScoreBrackets->updateByWait(
    1105             :               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1106             :         }
    1107      376458 :         EmitWaitcnt |= ScoreBrackets->updateByWait(
    1108             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1109             :       }
    1110             :     } // End of for loop that looks at all dest operands.
    1111             :   }
    1112             : 
    1113             :   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
    1114             :   // occurs before the instruction. Doing it here prevents any additional
    1115             :   // S_WAITCNTs from being emitted if the instruction was marked as
    1116             :   // requiring a WAITCNT beforehand.
    1117      605639 :   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
    1118          67 :       !ST->hasAutoWaitcntBeforeBarrier()) {
    1119          65 :     EmitWaitcnt |=
    1120          65 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1121          65 :     EmitWaitcnt |= ScoreBrackets->updateByWait(
    1122             :         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1123          65 :     EmitWaitcnt |= ScoreBrackets->updateByWait(
    1124             :         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
    1125             :   }
    1126             : 
    1127             :   // TODO: Remove this work-around, enable the assert for Bug 457939
    1128             :   //       after fixing the scheduler. Also, the Shader Compiler code is
    1129             :   //       independent of target.
    1130         287 :   if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    1131             :     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1132         190 :             ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1133          29 :         ScoreBrackets->hasPendingSMEM()) {
    1134             :       // Wait on everything, not just LGKM.  vccz reads usually come from
    1135             :       // terminators, and we always wait on everything at the end of the
    1136             :       // block, so if we only wait on LGKM here, we might end up with
    1137             :       // another s_waitcnt inserted right after this if there are non-LGKM
    1138             :       // instructions still outstanding.
    1139             :       // FIXME: this is too conservative / the comment is wrong.
    1140             :       // We don't wait on everything at the end of the block and we combine
    1141             :       // waitcnts so we should never have back-to-back waitcnts.
    1142             :       ForceEmitZeroWaitcnt = true;
    1143             :       EmitWaitcnt = true;
    1144             :     }
    1145             :   }
    1146             : 
    1147             :   // Does this operand processing indicate s_wait counter update?
    1148      302786 :   if (EmitWaitcnt || IsForceEmitWaitcnt) {
    1149             :     int CntVal[NUM_INST_CNTS];
    1150             : 
    1151             :     bool UseDefaultWaitcntStrategy = true;
    1152       35941 :     if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
    1153             :       // Force all waitcnts to 0.
    1154          56 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1155          24 :            T = (enum InstCounterType)(T + 1)) {
    1156             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1157             :       }
    1158           8 :       CntVal[VM_CNT] = 0;
    1159           8 :       CntVal[EXP_CNT] = 0;
    1160           8 :       CntVal[LGKM_CNT] = 0;
    1161             :       UseDefaultWaitcntStrategy = false;
    1162             :     }
    1163             : 
    1164             :     if (UseDefaultWaitcntStrategy) {
    1165      251531 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1166      107799 :            T = (enum InstCounterType)(T + 1)) {
    1167      107799 :         if (EmitWaitcnt & CNT_MASK(T)) {
    1168             :           int Delta =
    1169       38575 :               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
    1170             :           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
    1171       38575 :           if (Delta >= MaxDelta) {
    1172             :             Delta = -1;
    1173          12 :             if (T != EXP_CNT) {
    1174          12 :               ScoreBrackets->setScoreLB(
    1175             :                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
    1176             :             }
    1177          12 :             EmitWaitcnt &= ~CNT_MASK(T);
    1178             :           }
    1179       38575 :           CntVal[T] = Delta;
    1180             :         } else {
    1181             :           // If we are not waiting for a particular counter then encode
    1182             :           // it as -1 which means "don't care."
    1183       69224 :           CntVal[T] = -1;
    1184             :         }
    1185             :       }
    1186             :     }
    1187             : 
    1188             :     // If we are not waiting on any counter we can skip the wait altogether.
    1189       35941 :     if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
    1190       35931 :       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
    1191       35931 :       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
    1192        1321 :       if (!OldWaitcnt ||
    1193        1321 :           (AMDGPU::decodeVmcnt(IV, Imm) !=
    1194        2632 :                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
    1195        1311 :           (AMDGPU::decodeExpcnt(IV, Imm) !=
    1196       38535 :            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
    1197        1293 :           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
    1198        1293 :            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
    1199       34638 :         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
    1200         224 :         if (ContainingLoop) {
    1201         224 :           MachineBasicBlock *TBB = ContainingLoop->getHeader();
    1202             :           BlockWaitcntBrackets *ScoreBracket =
    1203         224 :               BlockWaitcntBracketsMap[TBB].get();
    1204         224 :           if (!ScoreBracket) {
    1205             :             assert(!BlockVisitedSet.count(TBB));
    1206             :             BlockWaitcntBracketsMap[TBB] =
    1207           0 :                 llvm::make_unique<BlockWaitcntBrackets>(ST);
    1208             :             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    1209             :           }
    1210             :           ScoreBracket->setRevisitLoop(true);
    1211             :           LLVM_DEBUG(dbgs()
    1212             :                          << "set-revisit2: Block"
    1213             :                          << ContainingLoop->getHeader()->getNumber() << '\n';);
    1214             :         }
    1215             :       }
    1216             : 
    1217             :       // Update an existing waitcount, or make a new one.
    1218      215586 :       unsigned Enc = AMDGPU::encodeWaitcnt(IV,
    1219       71862 :                       ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
    1220       71862 :                       ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
    1221      107793 :                       ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
    1222             :       // We don't remove waitcnts that existed prior to the waitcnt
    1223             :       // pass. Check if the waitcnt to-be-inserted can be avoided
    1224             :       // or if the prev waitcnt can be updated.
    1225             :       bool insertSWaitInst = true;
    1226       35931 :       for (MachineBasicBlock::iterator I = MI.getIterator(),
    1227       35931 :                                        B = MI.getParent()->begin();
    1228       71590 :            insertSWaitInst && I != B; --I) {
    1229       70324 :         if (I == MI.getIterator())
    1230             :           continue;
    1231             : 
    1232       69330 :         switch (I->getOpcode()) {
    1233             :         case AMDGPU::S_WAITCNT:
    1234        1124 :           if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
    1235             :             insertSWaitInst = false;
    1236           5 :           else if (!OldWaitcnt) {
    1237             :             OldWaitcnt = &*I;
    1238           0 :             Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
    1239             :           }
    1240             :           break;
    1241             :         // TODO: skip over instructions which never require wait.
    1242             :         }
    1243             :         break;
    1244             :       }
    1245       35931 :       if (insertSWaitInst) {
    1246       35014 :         if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
    1247             :           if (ForceEmitZeroWaitcnts)
    1248             :             LLVM_DEBUG(
    1249             :                 dbgs()
    1250             :                 << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
    1251             :           if (IsForceEmitWaitcnt)
    1252             :             LLVM_DEBUG(dbgs()
    1253             :                        << "Force emit a s_waitcnt due to debug counter\n");
    1254             : 
    1255         202 :           OldWaitcnt->getOperand(0).setImm(Enc);
    1256         202 :           if (!OldWaitcnt->getParent())
    1257         197 :             MI.getParent()->insert(MI, OldWaitcnt);
    1258             : 
    1259             :           LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
    1260             :                             << "Old Instr: " << MI << '\n'
    1261             :                             << "New Instr: " << *OldWaitcnt << '\n');
    1262             :         } else {
    1263       34610 :             auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
    1264       34610 :                                MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1265       34610 :                              .addImm(Enc);
    1266       69220 :             TrackedWaitcntSet.insert(SWaitInst);
    1267             : 
    1268             :             LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
    1269             :                               << "Old Instr: " << MI << '\n'
    1270             :                               << "New Instr: " << *SWaitInst << '\n');
    1271             :         }
    1272             :       }
    1273             : 
    1274       35931 :       if (CntVal[EXP_CNT] == 0) {
    1275             :         ScoreBrackets->setMixedExpTypes(false);
    1276             :       }
    1277             :     }
    1278             :   }
    1279             : }
    1280             : // Place Waitcnt at the end of MBB, ahead of a trailing branch if present.
    1281           0 : void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
    1282             :                                              MachineInstr *Waitcnt) {
    1283           0 :   if (MBB.empty()) {
    1284             :     MBB.push_back(Waitcnt);
    1285           0 :     return;
    1286             :   }
    1287             :   // Only the last instruction is checked for being a branch.
    1288           0 :   MachineBasicBlock::iterator It = MBB.end();
    1289             :   MachineInstr *MI = &*(--It);
    1290           0 :   if (MI->isBranch()) {
    1291             :     MBB.insert(It, Waitcnt);
    1292             :   } else {
    1293             :     MBB.push_back(Waitcnt);
    1294             :   }
    1295             : }
    1296             : 
    1297             : // Returns true when a flat memory operation may access LDS: either it has
    1298             : // no memory operands, or one of them is in the LOCAL or FLAT address space.
    1299        8937 : bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
    1300        8937 :   if (MI.memoperands_empty())
    1301             :     return true; // Nothing to inspect, so report a possible LDS access.
    1302             : 
    1303       23557 :   for (const MachineMemOperand *Memop : MI.memoperands()) {
    1304             :     unsigned AS = Memop->getAddrSpace();
    1305        8914 :     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
    1306             :       return true;
    1307             :   }
    1308             : 
    1309             :   return false;
    1310             : }
    1311             : 
    1312      302803 : void SIInsertWaitcnts::updateEventWaitcntAfter(
    1313             :     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
    1314             :   // Classify Inst (DS, FLAT, VMEM, SMRD, sendmsg, export, memtime) and record
    1315             :   // the matching event(s) with updateByEvent, which updates the upper bound of
    1316             :   // the affected counter's bracket and the destination operand scores.
    1317             :   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
    1318      311220 :   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    1319        8417 :     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
    1320           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
    1321           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    1322             :     } else {
    1323        8417 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1324             :     }
    1325      294386 :   } else if (TII->isFLAT(Inst)) {
    1326             :     assert(Inst.mayLoad() || Inst.mayStore());
    1327             :     // A FLAT instruction may record a VMEM event, an LDS event, or both.
    1328       11517 :     if (TII->usesVM_CNT(Inst))
    1329       11517 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    1330             : 
    1331       11517 :     if (TII->usesLGKM_CNT(Inst)) {
    1332        8937 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1333             : 
    1334             :       // This is a flat memory operation, so note it - it will require
    1335             :       // that both the VM and LGKM be flushed to zero if it is pending when
    1336             :       // a VM or LGKM dependency occurs.
    1337        8937 :       if (mayAccessLDSThroughFlat(Inst))
    1338             :         ScoreBrackets->setPendingFlat();
    1339             :     }
    1340       38534 :   } else if (SIInstrInfo::isVMEM(Inst) &&
    1341             :              // TODO: get a better carve out than listing BUFFER_WBINVL1* opcodes.
    1342       38418 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
    1343       38417 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
    1344             :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    1345       37399 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    1346       92434 :     if (ST->vmemWriteNeedsExpWaitcnt() &&
    1347       32134 :         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
    1348       10387 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    1349             :     }
    1350      245470 :   } else if (TII->isSMRD(Inst)) {
    1351       33847 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1352             :   } else {
    1353      211623 :     switch (Inst.getOpcode()) {
    1354          26 :     case AMDGPU::S_SENDMSG:
    1355             :     case AMDGPU::S_SENDMSGHALT:
    1356          26 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
    1357          26 :       break;
    1358         453 :     case AMDGPU::EXP:
    1359             :     case AMDGPU::EXP_DONE: {
    1360         453 :       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); // The tgt operand value picks the export event below.
    1361         453 :       if (Imm >= 32 && Imm <= 63)
    1362         120 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    1363         333 :       else if (Imm >= 12 && Imm <= 15)
    1364          37 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    1365             :       else
    1366         296 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
    1367             :       break;
    1368             :     }
    1369           0 :     case AMDGPU::S_MEMTIME:
    1370             :     case AMDGPU::S_MEMREALTIME: // Recorded with the same event as the SMRD branch above.
    1371           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1372           0 :       break;
    1373             :     default: // Any other opcode records no event.
    1374             :       break;
    1375             :     }
    1376             :   }
    1377      302803 : }
    1378             : 
    1379             : // Merge the score brackets of the Block's predecessors;
    1380             : // this merged score bracket is used when adding waitcnts to the Block
    1381       20397 : void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
    1382       40794 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1383       20397 :   int32_t MaxPending[NUM_INST_CNTS] = {0};
    1384       20397 :   int32_t MaxFlat[NUM_INST_CNTS] = {0};
    1385             :   bool MixedExpTypes = false;
    1386             : 
    1387             :   // For single basic block loops, we need to retain the Block's
    1388             :   // score bracket to have accurate Pred info. So, make a copy of Block's
    1389             :   // score bracket, clear() it (which retains several important bits of info),
    1390             :   // populate, and then replace en masse. For non-single basic block loops,
    1391             :   // just clear Block's current score bracket and repopulate in-place.
    1392             :   bool IsSelfPred;
    1393             :   std::unique_ptr<BlockWaitcntBrackets> S;
    1394             : 
    1395       61191 :   IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    1396             :     != Block.pred_end();
    1397       20397 :   if (IsSelfPred) {
    1398         560 :     S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    1399             :     ScoreBrackets = S.get();
    1400             :   }
    1401             : 
    1402       20397 :   ScoreBrackets->clear();
    1403             : 
    1404             :   // See if there are any uninitialized predecessors. If so, emit an
    1405             :   // s_waitcnt 0 at the beginning of the block.
    1406       24208 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1407             :     BlockWaitcntBrackets *PredScoreBrackets =
    1408             :         BlockWaitcntBracketsMap[Pred].get();
    1409        3811 :     bool Visited = BlockVisitedSet.count(Pred);
    1410        3719 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1411          92 :       continue;
    1412             :     }
    1413       26033 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1414       11157 :          T = (enum InstCounterType)(T + 1)) {
    1415             :       int span =
    1416       11157 :           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
    1417       22314 :       MaxPending[T] = std::max(MaxPending[T], span);
    1418       11157 :       span =
    1419       11157 :           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
    1420       22314 :       MaxFlat[T] = std::max(MaxFlat[T], span);
    1421             :     }
    1422             : 
    1423        3719 :     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
    1424             :   }
    1425             : 
    1426             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1427             :   // Also handle kills for exit block.
    1428       38316 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1429        5105 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1430       11221 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1431        4809 :            T = (enum InstCounterType)(T + 1)) {
    1432        4809 :         int Span = KillWaitBrackets[I]->getScoreUB(T) -
    1433        4809 :                    KillWaitBrackets[I]->getScoreLB(T);
    1434        9618 :         MaxPending[T] = std::max(MaxPending[T], Span);
    1435        4809 :         Span = KillWaitBrackets[I]->pendingFlat(T) -
    1436             :                KillWaitBrackets[I]->getScoreLB(T);
    1437        9618 :         MaxFlat[T] = std::max(MaxFlat[T], Span);
    1438             :       }
    1439             : 
    1440        1603 :       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    1441             :     }
    1442             :   }
    1443             : 
    1444             :   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
    1445       24208 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1446             :     BlockWaitcntBrackets *PredScoreBrackets =
    1447             :         BlockWaitcntBracketsMap[Pred].get();
    1448        3811 :     bool Visited = BlockVisitedSet.count(Pred);
    1449        3811 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1450          92 :       continue;
    1451             :     }
    1452             : 
    1453        3719 :     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
    1454        3719 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1455        3719 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1456        3719 :     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
    1457        3719 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1458        3719 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1459             :   }
    1460             : 
    1461             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1462       38316 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1463        5105 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1464        1603 :       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
    1465        1603 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1466        1603 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1467        1603 :       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
    1468        1603 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1469        1603 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1470             :     }
    1471             :   }
    1472             : 
    1473             : #if 0
    1474             :   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
    1475             :   // TODO: how does LC distinguish between function entry and main entry?
    1476             :   // If this is the entry to a function, force a wait.
    1477             :   MachineBasicBlock &Entry = Block.getParent()->front();
    1478             :   if (Entry.getNumber() == Block.getNumber()) {
    1479             :     ScoreBrackets->setWaitAtBeginning();
    1480             :     return;
    1481             :   }
    1482             : #endif
    1483             : 
    1484             :   // Now set the current Block's brackets to the largest ending bracket.
    1485      142779 :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1486       61191 :        T = (enum InstCounterType)(T + 1)) {
    1487       61191 :     ScoreBrackets->setScoreUB(T, MaxPending[T]);
    1488             :     ScoreBrackets->setScoreLB(T, 0);
    1489       61191 :     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
    1490             :   }
    1491             : 
    1492             :   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
    1493             : 
    1494             :   // Set the register scoreboard.
    1495       24208 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1496          92 :     if (!BlockVisitedSet.count(Pred)) {
    1497          92 :       continue;
    1498             :     }
    1499             : 
    1500             :     BlockWaitcntBrackets *PredScoreBrackets =
    1501             :         BlockWaitcntBracketsMap[Pred].get();
    1502             : 
    1503             :     // Now merge the gpr_reg_score information
    1504       26033 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1505       11157 :          T = (enum InstCounterType)(T + 1)) {
    1506             :       int PredLB = PredScoreBrackets->getScoreLB(T);
    1507             :       int PredUB = PredScoreBrackets->getScoreUB(T);
    1508       11157 :       if (PredLB < PredUB) {
    1509        1926 :         int PredScale = MaxPending[T] - PredUB;
    1510             :         // Merge vgpr scores.
    1511      142240 :         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
    1512             :           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
    1513       70157 :           if (PredRegScore <= PredLB)
    1514       64567 :             continue;
    1515        5590 :           int NewRegScore = PredScale + PredRegScore;
    1516        5590 :           ScoreBrackets->setRegScore(
    1517       11180 :               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1518             :         }
    1519             :         // Also need to merge sgpr scores for lgkm_cnt.
    1520        1926 :         if (T == LGKM_CNT) {
    1521        8167 :           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
    1522             :             int PredRegScore =
    1523        3710 :                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1524        3710 :             if (PredRegScore <= PredLB)
    1525        2664 :               continue;
    1526        1046 :             int NewRegScore = PredScale + PredRegScore;
    1527        1046 :             ScoreBrackets->setRegScore(
    1528             :                 J + NUM_ALL_VGPRS, LGKM_CNT,
    1529             :                 std::max(
    1530        2092 :                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1531             :                     NewRegScore));
    1532             :           }
    1533             :         }
    1534             :       }
    1535             :     }
    1536             : 
    1537             :     // Also merge the WaitEvent information.
    1538       78099 :     ForAllWaitEventType(W) {
    1539             :       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
    1540             :       int PredEventUB = PredScoreBrackets->getEventUB(W);
    1541       37190 :       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
    1542             :         int NewEventUB =
    1543        3866 :             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
    1544        1933 :         if (NewEventUB > 0) {
    1545        1933 :           ScoreBrackets->setEventUB(
    1546        3866 :               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1547             :         }
    1548             :       }
    1549             :     }
    1550             :   }
    1551             : 
    1552             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1553             :   // Set the register scoreboard.
    1554       38316 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1555        5105 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1556             :       // Now merge the gpr_reg_score information.
    1557       11221 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1558        4809 :            T = (enum InstCounterType)(T + 1)) {
    1559        4809 :         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
    1560             :         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
    1561        4809 :         if (PredLB < PredUB) {
    1562         513 :           int PredScale = MaxPending[T] - PredUB;
    1563             :           // Merge vgpr scores.
    1564        4722 :           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
    1565             :             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
    1566        1232 :             if (PredRegScore <= PredLB)
    1567        1221 :               continue;
    1568          11 :             int NewRegScore = PredScale + PredRegScore;
    1569          11 :             ScoreBrackets->setRegScore(
    1570          22 :                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1571             :           }
    1572             :           // Also need to merge sgpr scores for lgkm_cnt.
    1573         513 :           if (T == LGKM_CNT) {
    1574       18999 :             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
    1575             :               int PredRegScore =
    1576        5995 :                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1577        5995 :               if (PredRegScore <= PredLB)
    1578        4292 :                 continue;
    1579        1703 :               int NewRegScore = PredScale + PredRegScore;
    1580        1703 :               ScoreBrackets->setRegScore(
    1581             :                   J + NUM_ALL_VGPRS, LGKM_CNT,
    1582             :                   std::max(
    1583        3406 :                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1584             :                       NewRegScore));
    1585             :             }
    1586             :           }
    1587             :         }
    1588             :       }
    1589             : 
    1590             :       // Also merge the WaitEvent information.
    1591       33663 :       ForAllWaitEventType(W) {
    1592       16030 :         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
    1593             :         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
    1594       16030 :         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
    1595             :           int NewEventUB =
    1596        1026 :               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
    1597         513 :           if (NewEventUB > 0) {
    1598         513 :             ScoreBrackets->setEventUB(
    1599        1026 :                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1600             :           }
    1601             :         }
    1602             :       }
    1603             :     }
    1604             :   }
    1605             : 
    1606             :   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
    1607             :   // sequencing predecessors, because changes to EXEC require waitcnts due to
    1608             :   // the delayed nature of these operations.
    1609       24208 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1610          92 :     if (!BlockVisitedSet.count(Pred)) {
    1611          92 :       continue;
    1612             :     }
    1613             : 
    1614             :     BlockWaitcntBrackets *PredScoreBrackets =
    1615             :         BlockWaitcntBracketsMap[Pred].get();
    1616             : 
    1617             :     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    1618        3719 :     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1619           0 :       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
    1620           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1621           0 :       if (new_gds_ub > 0) {
    1622           0 :         ScoreBrackets->setEventUB(
    1623             :             GDS_GPR_LOCK,
    1624           0 :             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
    1625             :       }
    1626             :     }
    1627             :     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    1628        3719 :     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1629           4 :       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
    1630           2 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1631           2 :       if (new_exp_ub > 0) {
    1632           2 :         ScoreBrackets->setEventUB(
    1633             :             EXP_GPR_LOCK,
    1634           4 :             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
    1635             :       }
    1636             :     }
    1637             :   }
    1638             : 
    1639             :   // if a single block loop, update the score brackets. Not needed for other
    1640             :   // blocks, as we did this in-place
    1641       20397 :   if (IsSelfPred) {
    1642         840 :     BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    1643             :   }
    1644       20397 : }
    1645             : 
    1646             : /// Return true if \p Block is a "bottom" block of \p Loop, i.e. it is one of
    1647             : /// Loop->blocks() and has the loop header as a successor. Every block of the
    1648             : /// loop is scanned, so discontiguous loops and multiple back-edges are handled.
    1649        1510 : bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
    1650             :                                     const MachineBasicBlock *Block) {
    1651        8690 :   for (MachineBasicBlock *MBB : Loop->blocks()) {
    1652        6002 :     if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
    1653             :       return true;
    1654             :     }
    1655             :   }
    1656             :   return false; // Block is not in the loop, or has no edge to the header.
    1657             : }
    1658             : 
    1659             : /// Count the "bottom" basic blocks of \p Loop, i.e. the blocks in Loop->blocks() that have the loop header as a successor.
    1660         873 : unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
    1661             :   unsigned Count = 0;
    1662        4387 :   for (MachineBasicBlock *MBB : Loop->blocks()) {
    1663        1757 :     if (MBB->isSuccessor(Loop->getHeader())) { // Same test isLoopBottom applies to a single block.
    1664         989 :       Count++;
    1665             :     }
    1666             :   }
    1667         873 :   return Count;
    1668             : }
    1669             : 
    1670             : // Generate s_waitcnt instructions where needed in \p Block: call mergeInputScoreBrackets, process every instruction in order, then force convergence if \p Block is a loop bottom whose loop has iterated too often.
    1671       20397 : void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // NOTE(review): MF is not referenced in this body.
    1672             :                                             MachineBasicBlock &Block) {
    1673             :   // Initialize the state information.
    1674       20397 :   mergeInputScoreBrackets(Block);
    1675             : 
    1676       40794 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1677             : 
    1678             :   LLVM_DEBUG({
    1679             :     dbgs() << "*** Block" << Block.getNumber() << " ***";
    1680             :     ScoreBrackets->dump();
    1681             :   });
    1682             : 
    1683             :   // Walk over the instructions. Iter is advanced explicitly in the loop body.
    1684       20397 :   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
    1685      325716 :        Iter != E;) {
    1686             :     MachineInstr &Inst = *Iter;
    1687             :     // Handle any previously existing waitcnts.
    1688      613154 :     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
    1689             :       // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
    1690             :       // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
    1691             :       // as needed.
    1692             :       if (!TrackedWaitcntSet.count(&Inst))
    1693             :         ++Iter;
    1694             :       else {
    1695             :         ++Iter; // Advance before unlinking Inst from the block.
    1696         198 :         Inst.removeFromParent();
    1697             :       }
    1698             :       ScoreBrackets->setWaitcnt(&Inst); // Recorded in both cases, including when Inst was just removed.
    1699             :       continue;
    1700             :     }
    1701             : 
    1702             :     // Kill instructions generate a conditional branch to the endmain block.
    1703             :     // Merge the current waitcnt state into the endmain block information.
    1704             :     // TODO: Are there other flavors of KILL instruction?
    1705      302803 :     if (Inst.getOpcode() == AMDGPU::KILL) {
    1706         199 :       addKillWaitBracket(ScoreBrackets);
    1707             :     }
    1708             : 
    1709             :     bool VCCZBugWorkAround = false; // Set when Inst reads vccz with SMEM pending on SEA_ISLANDS or older.
    1710             :     if (readsVCCZ(Inst) &&
    1711             :         (!VCCZBugHandledSet.count(&Inst))) {
    1712             :       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1713         287 :               ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1714          30 :           ScoreBrackets->hasPendingSMEM()) {
    1715           9 :         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
    1716             :           VCCZBugWorkAround = true;
    1717             :       }
    1718             :     }
    1719             : 
    1720             :     // Generate an s_waitcnt instruction to be placed before
    1721             :     // Inst, if needed.
    1722      302803 :     generateWaitcntInstBefore(Inst, ScoreBrackets);
    1723             : 
    1724      302803 :     updateEventWaitcntAfter(Inst, ScoreBrackets);
    1725             : 
    1726             : #if 0 // TODO: implement resource type check controlled by options with ub = LB.
    1727             :     // If this instruction generates a S_SETVSKIP because it is an
    1728             :     // indexed resource, and we are on Tahiti, then it will also force
    1729             :     // an S_WAITCNT vmcnt(0)
    1730             :     if (RequireCheckResourceType(Inst, context)) {
    1731             :       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
    1732             :       ScoreBrackets->setScoreLB(VM_CNT,
    1733             :       ScoreBrackets->getScoreUB(VM_CNT));
    1734             :     }
    1735             : #endif
    1736             : 
    1737             :     ScoreBrackets->clearWaitcnt();
    1738             : 
    1739             :     LLVM_DEBUG({
    1740             :       Inst.print(dbgs());
    1741             :       ScoreBrackets->dump();
    1742             :     });
    1743             : 
    1744             :     // Check to see if this is a GWS instruction. If so, and if this is CI or
    1745             :     // VI, then the generated code sequence will include an S_WAITCNT 0.
    1746             :     // TODO: Are these the only GWS instructions?
    1747      605606 :     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
    1748      302803 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
    1749      302803 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
    1750      605606 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
    1751             :         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
    1752             :       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
    1753           0 :       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1754           0 :       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1755           0 :       ScoreBrackets->updateByWait(LGKM_CNT,
    1756             :                                   ScoreBrackets->getScoreUB(LGKM_CNT));
    1757             :     }
    1758             : 
    1759             :     // TODO: Remove this work-around after fixing the scheduler and enable the
    1760             :     // assert above.
    1761      302803 :     if (VCCZBugWorkAround) {
    1762             :       // Restore the vccz bit.  Any time a value is written to vcc, the vccz
    1763             :       // bit is updated, so we can restore the bit by reading the value of
    1764             :       // vcc and then writing it back to the register.
    1765          24 :       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    1766           8 :               AMDGPU::VCC)
    1767           8 :           .addReg(AMDGPU::VCC);
    1768          16 :       VCCZBugHandledSet.insert(&Inst); // Checked above so the workaround is applied to Inst only once.
    1769             :     }
    1770             : 
    1771             :     ++Iter;
    1772             :   }
    1773             : 
    1774             :   // Check if we need to force convergence at loop footer.
    1775       40794 :   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
    1776       20397 :   if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    1777         451 :     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1778             :     WaitcntData->print(); // NOTE(review): called outside LLVM_DEBUG; presumably print() guards its own output - confirm.
    1779             :     LLVM_DEBUG(dbgs() << '\n';);
    1780             : 
    1781             :     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    1782             :     // placement, but doesn't guarantee convergence for a loop. Each
    1783             :     // loop should take at most (n+1) iterations for it to converge naturally,
    1784             :     // where n is the number of bottom blocks. If this threshold is reached and
    1785             :     // the result hasn't converged, then we force convergence by inserting
    1786             :     // a s_waitcnt at the end of loop footer.
    1787         902 :     if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
    1788             :       // To ensure convergence, need to make wait events at loop footer be no
    1789             :       // more than those from the previous iteration.
    1790             :       // As a simplification, instead of tracking individual scores and
    1791             :       // generating the precise wait count, just wait on 0.
    1792             :       bool HasPending = false;
    1793           0 :       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
    1794           0 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1795           0 :            T = (enum InstCounterType)(T + 1)) {
    1796           0 :         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
    1797             :           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1798             :           HasPending = true;
    1799             :           break; // NOTE(review): stops at the first pending counter, so later counters keep their lower bound - confirm intended.
    1800             :         }
    1801             :       }
    1802             : 
    1803             :       if (HasPending) {
    1804           0 :         if (!SWaitInst) {
    1805           0 :           SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), // NOTE(review): this is later passed to insertWaitcntBeforeCF - confirm it is not inserted twice.
    1806           0 :                               DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1807             :                               .addImm(0);
    1808             :           TrackedWaitcntSet.insert(SWaitInst);
    1809             : #if 0 // TODO: Format the debug output
    1810             :           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
    1811             :           OutputTransformAdd(SWaitInst, context);
    1812             : #endif
    1813             :         }
    1814             : #if 0 // TODO: ??
    1815             :         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
    1816             : #endif
    1817             :       }
    1818             : 
    1819           0 :       if (SWaitInst) {
    1820             :         LLVM_DEBUG({
    1821             :           SWaitInst->print(dbgs());
    1822             :           dbgs() << "\nAdjusted score board:";
    1823             :           ScoreBrackets->dump();
    1824             :         });
    1825             : 
    1826             :         // Add this waitcnt to the block. It is either newly created or
    1827             :         // created in previous iterations and added back since block traversal
    1828             :         // always removes waitcnts.
    1829           0 :         insertWaitcntBeforeCF(Block, SWaitInst);
    1830           0 :         WaitcntData->setWaitcnt(SWaitInst);
    1831             :       }
    1832             :     }
    1833             :   }
    1834       20397 : }
    1835             : 
    1836       17879 : bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // Pass entry point for one function; returns Modified.
    1837       17879 :   ST = &MF.getSubtarget<SISubtarget>(); // Cache subtarget-derived state for this function.
    1838       17879 :   TII = ST->getInstrInfo();
    1839       17879 :   TRI = &TII->getRegisterInfo();
    1840       17879 :   MRI = &MF.getRegInfo();
    1841       17879 :   MLI = &getAnalysis<MachineLoopInfo>();
    1842       35758 :   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
    1843       17879 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1844       35758 :   AMDGPUASI = ST->getAMDGPUAS();
    1845             : 
    1846       17879 :   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
    1847      125153 :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1848       53637 :        T = (enum InstCounterType)(T + 1))
    1849       53637 :     ForceEmitWaitcnt[T] = false; // Reset the per-counter force flags.
    1850             : 
    1851       17879 :   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); // Counter maxima are taken from the ISA version's bit masks.
    1852       17879 :   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
    1853       17879 :   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
    1854             : 
    1855       35758 :   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
    1856       35758 :   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
    1857             :   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
    1858             :   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
    1859             : 
    1860       35758 :   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
    1861       17879 :   RegisterEncoding.VGPRL =
    1862       17879 :       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1; // Encoding of the last addressable VGPR.
    1863       17879 :   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
    1864       17879 :   RegisterEncoding.SGPRL =
    1865       17879 :       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; // Encoding of the last addressable SGPR.
    1866             : 
    1867             :   TrackedWaitcntSet.clear();
    1868             :   BlockVisitedSet.clear();
    1869             :   VCCZBugHandledSet.clear();
    1870       17879 :   LoopWaitcntDataMap.clear(); // NOTE(review): BlockWaitcntProcessedSet and BlockWaitcntBracketsMap are used below but not cleared here - confirm no state leaks across functions.
    1871             : 
    1872             :   // Walk over the blocks in reverse post-order, inserting
    1873             :   // s_waitcnt where needed.
    1874             :   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
    1875             :   bool Modified = false; // NOTE(review): only set by the s_dcache_wb and non-entry-function insertions below, not by the waitcnt walk - confirm.
    1876             :   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1877             :            I = RPOT.begin(),
    1878             :            E = RPOT.end(), J = RPOT.begin(); // NOTE(review): J is assigned in this loop but never read in this function.
    1879       38276 :        I != E;) {
    1880       20397 :     MachineBasicBlock &MBB = **I;
    1881             : 
    1882       40794 :     BlockVisitedSet.insert(&MBB);
    1883             : 
    1884       40794 :     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1885       20397 :     if (!ScoreBrackets) { // First visit of this block: create its score brackets.
    1886       23223 :       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
    1887       15482 :       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1888             :     }
    1889       20397 :     ScoreBrackets->setPostOrder(MBB.getNumber()); // Stores the block number; compared against getNumber() when rewinding below.
    1890       40794 :     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    1891       21152 :     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
    1892             :       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
    1893             : 
    1894             :     // If we are walking into the block from before the loop, then guarantee
    1895             :     // at least 1 re-walk over the loop to propagate the information, even if
    1896             :     // no S_WAITCNT instructions were generated.
    1897       21152 :     if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
    1898         422 :       unsigned Count = countNumBottomBlocks(ContainingLoop);
    1899             : 
    1900             :       // If the loop has multiple back-edges, and so more than one "bottom"
    1901             :       // basic block, we have to guarantee a re-walk over every block.
    1902         422 :       if ((std::count(BlockWaitcntProcessedSet.begin(),
    1903         422 :                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
    1904         410 :         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
    1905             :         LLVM_DEBUG(dbgs() << "set-revisit1: Block"
    1906             :                           << ContainingLoop->getHeader()->getNumber() << '\n';);
    1907             :       }
    1908             :     }
    1909             : 
    1910             :     // Walk over the instructions.
    1911       20397 :     insertWaitcntInBlock(MF, MBB);
    1912             : 
    1913             :     // Record that waitcnts have been processed at least once for this block.
    1914       40794 :     BlockWaitcntProcessedSet.push_back(&MBB);
    1915             : 
    1916             :     // See if we want to revisit the loop. If a loop has multiple back-edges,
    1917             :     // we shouldn't revisit the same "bottom" basic block.
    1918       20848 :     if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
    1919             :         std::count(BlockWaitcntProcessedSet.begin(),
    1920             :                    BlockWaitcntProcessedSet.end(), &MBB) == 1) {
    1921         414 :       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
    1922             :       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
    1923         207 :       if (EntrySB && EntrySB->getRevisitLoop()) {
    1924             :         EntrySB->setRevisitLoop(false);
    1925             :         J = I;
    1926         205 :         int32_t PostOrder = EntrySB->getPostOrder();
    1927             :         // TODO: Avoid this loop. Find another way to set I.
    1928             :         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1929             :                  X = RPOT.begin(),
    1930             :                  Y = RPOT.end();
    1931         509 :              X != Y; ++X) {
    1932         509 :           MachineBasicBlock &MBBX = **X;
    1933         509 :           if (MBBX.getNumber() == PostOrder) {
    1934             :             I = X; // Rewind I to the loop header's position in RPOT.
    1935             :             break;
    1936             :           }
    1937             :         }
    1938             :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1939             :         WaitcntData->incIterCnt();
    1940             :         LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
    1941         205 :         continue; // Skips the ++I below so the block I now points at is processed next.
    1942             :       } else {
    1943             :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1944             :         // Loop converged, reset iteration count. If this loop gets revisited,
    1945             :         // it must be from an outer loop, the counter will restart, this will
    1946             :         // ensure we don't force convergence on such revisits.
    1947             :         WaitcntData->resetIterCnt();
    1948             :       }
    1949             :     }
    1950             : 
    1951             :     J = I;
    1952             :     ++I;
    1953             :   }
    1954             : 
    1955             :   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; // Blocks containing S_ENDPGM or SI_RETURN_TO_EPILOG.
    1956             : 
    1957             :   bool HaveScalarStores = false;
    1958             : 
    1959       37882 :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
    1960             :        ++BI) {
    1961             :     MachineBasicBlock &MBB = *BI;
    1962             : 
    1963      372757 :     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
    1964             :          ++I) {
    1965      665246 :       if (!HaveScalarStores && TII->isScalarStore(*I))
    1966             :         HaveScalarStores = true;
    1967             : 
    1968      665502 :       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
    1969             :           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
    1970       16475 :         EndPgmBlocks.push_back(&MBB);
    1971             :     }
    1972             :   }
    1973             : 
    1974       17879 :   if (HaveScalarStores) {
    1975             :     // If scalar writes are used, the cache must be flushed or else the next
    1976             :     // wave to reuse the same scratch memory can be clobbered.
    1977             :     //
    1978             :     // Insert s_dcache_wb at wave termination points if there were any scalar
    1979             :     // stores, and only if the cache hasn't already been flushed. This could be
    1980             :     // improved by looking across blocks for flushes in postdominating blocks
    1981             :     // from the stores but an explicitly requested flush is probably very rare.
    1982          46 :     for (MachineBasicBlock *MBB : EndPgmBlocks) {
    1983             :       bool SeenDCacheWB = false;
    1984             : 
    1985         166 :       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
    1986             :            ++I) {
    1987         268 :         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
    1988             :           SeenDCacheWB = true;
    1989         132 :         else if (TII->isScalarStore(*I))
    1990             :           SeenDCacheWB = false; // A scalar store after a flush requires a new flush.
    1991             : 
    1992             :         // FIXME: It would be better to insert this before a waitcnt if any.
    1993         119 :         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
    1994         150 :              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
    1995             :             !SeenDCacheWB) {
    1996             :           Modified = true;
    1997          30 :           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
    1998             :         }
    1999             :       }
    2000             :     }
    2001             :   }
    2002             : 
    2003       17879 :   if (!MFI->isEntryFunction()) {
    2004             :     // Wait for any outstanding memory operations that the input registers may
    2005             :     // depend on. We can't track them and it's better to wait after the
    2006             :     // costly call sequence.
    2007             : 
    2008             :     // TODO: Could insert earlier and schedule more liberally with operations
    2009             :     // that only use caller preserved registers.
    2010             :     MachineBasicBlock &EntryBB = MF.front();
    2011        5656 :     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    2012             :       .addImm(0);
    2013             : 
    2014             :     Modified = true;
    2015             :   }
    2016             : 
    2017       17879 :   return Modified;
    2018      303507 : }

Generated by: LCOV version 1.13