LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaitcnts.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 610 714 85.4 %
Date: 2017-09-14 15:23:50 Functions: 22 23 95.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/DenseMap.h"
      27             : #include "llvm/ADT/DenseSet.h"
      28             : #include "llvm/ADT/PostOrderIterator.h"
      29             : #include "llvm/ADT/STLExtras.h"
      30             : #include "llvm/ADT/SmallVector.h"
      31             : #include "llvm/CodeGen/MachineBasicBlock.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineFunctionPass.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineLoopInfo.h"
      37             : #include "llvm/CodeGen/MachineMemOperand.h"
      38             : #include "llvm/CodeGen/MachineOperand.h"
      39             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      40             : #include "llvm/IR/DebugLoc.h"
      41             : #include "llvm/Pass.h"
      42             : #include "llvm/Support/Debug.h"
      43             : #include "llvm/Support/ErrorHandling.h"
      44             : #include "llvm/Support/raw_ostream.h"
      45             : #include <algorithm>
      46             : #include <cassert>
      47             : #include <cstdint>
      48             : #include <cstring>
      49             : #include <memory>
      50             : #include <utility>
      51             : #include <vector>
      52             : 
      53             : #define DEBUG_TYPE "si-insert-waitcnts"
      54             : 
      55             : using namespace llvm;
      56             : 
      57             : namespace {
      58             : 
      59             : // Class of object that encapsulates latest instruction counter score
      60             : // associated with the operand.  Used for determining whether
      61             : // s_waitcnt instruction needs to be emited.
      62             : 
      63             : #define CNT_MASK(t) (1u << (t))
      64             : 
      65             : enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
      66             : 
      67             : using RegInterval = std::pair<signed, signed>;
      68             : 
      69             : struct {
      70             :   int32_t VmcntMax;
      71             :   int32_t ExpcntMax;
      72             :   int32_t LgkmcntMax;
      73             :   int32_t NumVGPRsMax;
      74             :   int32_t NumSGPRsMax;
      75             : } HardwareLimits;
      76             : 
      77             : struct {
      78             :   unsigned VGPR0;
      79             :   unsigned VGPRL;
      80             :   unsigned SGPR0;
      81             :   unsigned SGPRL;
      82             : } RegisterEncoding;
      83             : 
      84             : enum WaitEventType {
      85             :   VMEM_ACCESS,      // vector-memory read & write
      86             :   LDS_ACCESS,       // lds read & write
      87             :   GDS_ACCESS,       // gds read & write
      88             :   SQ_MESSAGE,       // send message
      89             :   SMEM_ACCESS,      // scalar-memory read & write
      90             :   EXP_GPR_LOCK,     // export holding on its data src
      91             :   GDS_GPR_LOCK,     // GDS holding on its data and addr src
      92             :   EXP_POS_ACCESS,   // write to export position
      93             :   EXP_PARAM_ACCESS, // write to export parameter
      94             :   VMW_GPR_LOCK,     // vector-memory write holding on its data src
      95             :   NUM_WAIT_EVENTS,
      96             : };
      97             : 
      98             : // The mapping is:
      99             : //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
     100             : //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
     101             : //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
     102             : // We reserve a fixed number of VGPR slots in the scoring tables for
     103             : // special tokens like SCMEM_LDS (needed for buffer load to LDS).
     104             : enum RegisterMapping {
     105             :   SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
     106             :   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
     107             :   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
     108             :   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
     109             :   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
     110             : };
     111             : 
     112             : #define ForAllWaitEventType(w)                                                 \
     113             :   for (enum WaitEventType w = (enum WaitEventType)0;                           \
     114             :        (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
     115             :        (w) = (enum WaitEventType)((w) + 1))
     116             : 
     117             : // This is a per-basic-block object that maintains current score brackets
     118             : // of each wait-counter, and a per-register scoreboard for each wait-couner.
     119             : // We also maintain the latest score for every event type that can change the
     120             : // waitcnt in order to know if there are multiple types of events within
     121             : // the brackets. When multiple types of event happen in the bracket,
     122             : // wait-count may get decreased out of order, therefore we need to put in
     123             : // "s_waitcnt 0" before use.
     124             : class BlockWaitcntBrackets {
     125             : public:
     126        5626 :   BlockWaitcntBrackets() {
     127       22504 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     128       16878 :          T = (enum InstCounterType)(T + 1)) {
     129       16878 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     130             :     }
     131        5626 :   }
     132             : 
     133             :   ~BlockWaitcntBrackets() = default;
     134             : 
     135             :   static int32_t getWaitCountMax(InstCounterType T) {
     136       37338 :     switch (T) {
     137       13814 :     case VM_CNT:
     138       13814 :       return HardwareLimits.VmcntMax;
     139       18968 :     case LGKM_CNT:
     140       18968 :       return HardwareLimits.LgkmcntMax;
     141        4556 :     case EXP_CNT:
     142       28124 :       return HardwareLimits.ExpcntMax;
     143             :     default:
     144             :       break;
     145             :     }
     146             :     return 0;
     147             :   }
     148             : 
     149             :   void setScoreLB(InstCounterType T, int32_t Val) {
     150             :     assert(T < NUM_INST_CNTS);
     151       40345 :     if (T >= NUM_INST_CNTS)
     152             :       return;
     153       92480 :     ScoreLBs[T] = Val;
     154             :   }
     155             : 
     156             :   void setScoreUB(InstCounterType T, int32_t Val) {
     157             :     assert(T < NUM_INST_CNTS);
     158             :     if (T >= NUM_INST_CNTS)
     159             :       return;
     160      165871 :     ScoreUBs[T] = Val;
     161      114808 :     if (T == EXP_CNT) {
     162       47136 :       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
     163       23568 :       if (ScoreLBs[T] < UB)
     164        4791 :         ScoreLBs[T] = UB;
     165             :     }
     166             :   }
     167             : 
     168             :   int32_t getScoreLB(InstCounterType T) {
     169             :     assert(T < NUM_INST_CNTS);
     170     2495901 :     if (T >= NUM_INST_CNTS)
     171             :       return 0;
     172     2575779 :     return ScoreLBs[T];
     173             :   }
     174             : 
     175             :   int32_t getScoreUB(InstCounterType T) {
     176             :     assert(T < NUM_INST_CNTS);
     177     2558929 :     if (T >= NUM_INST_CNTS)
     178             :       return 0;
     179     2633679 :     return ScoreUBs[T];
     180             :   }
     181             : 
     182             :   // Mapping from event to counter.
     183             :   InstCounterType eventCounter(WaitEventType E) {
     184      166588 :     switch (E) {
     185             :     case VMEM_ACCESS:
     186             :       return VM_CNT;
     187       63250 :     case LDS_ACCESS:
     188             :     case GDS_ACCESS:
     189             :     case SQ_MESSAGE:
     190             :     case SMEM_ACCESS:
     191             :       return LGKM_CNT;
     192       49458 :     case EXP_GPR_LOCK:
     193             :     case GDS_GPR_LOCK:
     194             :     case VMW_GPR_LOCK:
     195             :     case EXP_POS_ACCESS:
     196             :     case EXP_PARAM_ACCESS:
     197             :       return EXP_CNT;
     198           0 :     default:
     199           0 :       llvm_unreachable("unhandled event type");
     200             :     }
     201             :     return NUM_INST_CNTS;
     202             :   }
     203             : 
     204             :   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
     205      160503 :     if (GprNo < NUM_ALL_VGPRS) {
     206      102750 :       if (GprNo > VgprUB) {
     207       15089 :         VgprUB = GprNo;
     208             :       }
     209      102750 :       VgprScores[T][GprNo] = Val;
     210             :     } else {
     211             :       assert(T == LGKM_CNT);
     212       60611 :       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
     213       12668 :         SgprUB = GprNo - NUM_ALL_VGPRS;
     214             :       }
     215       60611 :       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
     216             :     }
     217             :   }
     218             : 
     219             :   int32_t getRegScore(int GprNo, InstCounterType T) {
     220     2523681 :     if (GprNo < NUM_ALL_VGPRS) {
     221     1383259 :       return VgprScores[T][GprNo];
     222             :     }
     223     1155917 :     return SgprScores[GprNo - NUM_ALL_VGPRS];
     224             :   }
     225             : 
     226       17021 :   void clear() {
     227       17021 :     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     228       17021 :     memset(ScoreUBs, 0, sizeof(ScoreUBs));
     229       17021 :     memset(EventUBs, 0, sizeof(EventUBs));
     230       68084 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     231       51063 :          T = (enum InstCounterType)(T + 1)) {
     232       51063 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     233             :     }
     234       17021 :     memset(SgprScores, 0, sizeof(SgprScores));
     235       17021 :   }
     236             : 
     237             :   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
     238             :                              const MachineRegisterInfo *MRI,
     239             :                              const SIRegisterInfo *TRI, unsigned OpNo,
     240             :                              bool Def) const;
     241             : 
     242             :   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
     243             :                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
     244             :                    unsigned OpNo, int32_t Val);
     245             : 
     246             :   void setWaitAtBeginning() { WaitAtBeginning = true; }
     247           0 :   void clearWaitAtBeginning() { WaitAtBeginning = false; }
     248             :   bool getWaitAtBeginning() const { return WaitAtBeginning; }
     249        3115 :   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
     250             :   int32_t getMaxVGPR() const { return VgprUB; }
     251             :   int32_t getMaxSGPR() const { return SgprUB; }
     252             : 
     253             :   int32_t getEventUB(enum WaitEventType W) const {
     254             :     assert(W < NUM_WAIT_EVENTS);
     255       79481 :     return EventUBs[W];
     256             :   }
     257             : 
     258             :   bool counterOutOfOrder(InstCounterType T);
     259             :   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
     260             :   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
     261             :                      const MachineRegisterInfo *MRI, WaitEventType E,
     262             :                      MachineInstr &MI);
     263             : 
     264             :   bool hasPendingSMEM() const {
     265          35 :     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     266             :             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
     267             :   }
     268             : 
     269             :   bool hasPendingFlat() const {
     270       15314 :     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
     271       29476 :              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
     272       14596 :             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
     273          50 :              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
     274             :   }
     275             : 
     276             :   void setPendingFlat() {
     277        1353 :     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
     278        1353 :     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
     279             :   }
     280             : 
     281       15534 :   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
     282             : 
     283       51063 :   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
     284             : 
     285             :   bool getRevisitLoop() const { return RevisitLoop; }
     286         546 :   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
     287             : 
     288       17021 :   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
     289             :   int32_t getPostOrder() const { return PostOrder; }
     290             : 
     291         233 :   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
     292      277031 :   void clearWaitcnt() { Waitcnt = nullptr; }
     293             :   MachineInstr *getWaitcnt() const { return Waitcnt; }
     294             : 
     295             :   bool mixedExpTypes() const { return MixedExpTypes; }
     296             :   void setMixedExpTypes(bool MixedExpTypesIn) {
     297       21318 :     MixedExpTypes = MixedExpTypesIn;
     298             :   }
     299             : 
     300             :   void print(raw_ostream &);
     301             :   void dump() { print(dbgs()); }
     302             : 
     303             : private:
     304             :   bool WaitAtBeginning = false;
     305             :   bool RevisitLoop = false;
     306             :   bool ValidLoop = false;
     307             :   bool MixedExpTypes = false;
     308             :   MachineLoop *LoopRegion = nullptr;
     309             :   int32_t PostOrder = 0;
     310             :   MachineInstr *Waitcnt = nullptr;
     311             :   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
     312             :   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
     313             :   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
     314             :   // Remember the last flat memory operation.
     315             :   int32_t LastFlat[NUM_INST_CNTS] = {0};
     316             :   // wait_cnt scores for every vgpr.
     317             :   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
     318             :   int32_t VgprUB = 0;
     319             :   int32_t SgprUB = 0;
     320             :   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
     321             :   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
     322             :   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
     323             : };
     324             : 
     325             : // This is a per-loop-region object that records waitcnt status at the end of
     326             : // loop footer from the previous iteration. We also maintain an iteration
     327             : // count to track the number of times the loop has been visited. When it
     328             : // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
     329             : // at the end of the loop footer.
     330             : class LoopWaitcntData {
     331             : public:
     332             :   LoopWaitcntData() = default;
     333             :   ~LoopWaitcntData() = default;
     334             : 
     335         176 :   void incIterCnt() { IterCnt++; }
     336         178 :   void resetIterCnt() { IterCnt = 0; }
     337             :   int32_t getIterCnt() { return IterCnt; }
     338             : 
     339           0 :   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
     340             :   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
     341             : 
     342             :   void print() {
     343             :     DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
     344             :   }
     345             : 
     346             : private:
     347             :   // s_waitcnt added at the end of loop footer to stablize wait scores
     348             :   // at the end of the loop footer.
     349             :   MachineInstr *LfWaitcnt = nullptr;
     350             :   // Number of iterations the loop has been visited, not including the initial
     351             :   // walk over.
     352             :   int32_t IterCnt = 0;
     353             : };
     354             : 
     355       11680 : class SIInsertWaitcnts : public MachineFunctionPass {
     356             : private:
     357             :   const SISubtarget *ST = nullptr;
     358             :   const SIInstrInfo *TII = nullptr;
     359             :   const SIRegisterInfo *TRI = nullptr;
     360             :   const MachineRegisterInfo *MRI = nullptr;
     361             :   const MachineLoopInfo *MLI = nullptr;
     362             :   AMDGPU::IsaInfo::IsaVersion IV;
     363             :   AMDGPUAS AMDGPUASI;
     364             : 
     365             :   DenseSet<MachineBasicBlock *> BlockVisitedSet;
     366             :   DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
     367             :   DenseSet<MachineInstr *> VCCZBugHandledSet;
     368             : 
     369             :   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
     370             :       BlockWaitcntBracketsMap;
     371             : 
     372             :   DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
     373             : 
     374             :   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
     375             : 
     376             :   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
     377             : 
     378             : public:
     379             :   static char ID;
     380             : 
     381       11744 :   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
     382             : 
     383             :   bool runOnMachineFunction(MachineFunction &MF) override;
     384             : 
     385        1461 :   StringRef getPassName() const override {
     386        1461 :     return "SI insert wait instructions";
     387             :   }
     388             : 
     389        1461 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     390        1461 :     AU.setPreservesCFG();
     391        1461 :     AU.addRequired<MachineLoopInfo>();
     392        1461 :     MachineFunctionPass::getAnalysisUsage(AU);
     393        1461 :   }
     394             : 
     395         270 :   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
     396             :     // The waitcnt information is copied because it changes as the block is
     397             :     // traversed.
     398         540 :     KillWaitBrackets.push_back(
     399         540 :         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
     400         270 :   }
     401             : 
     402             :   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
     403             :   MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
     404             :                                            BlockWaitcntBrackets *ScoreBrackets);
     405             :   void updateEventWaitCntAfter(MachineInstr &Inst,
     406             :                                BlockWaitcntBrackets *ScoreBrackets);
     407             :   void mergeInputScoreBrackets(MachineBasicBlock &Block);
     408             :   MachineBasicBlock *loopBottom(const MachineLoop *Loop);
     409             :   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
     410             :   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
     411             : };
     412             : 
     413             : } // end anonymous namespace
     414             : 
     415     3081941 : RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
     416             :                                                  const SIInstrInfo *TII,
     417             :                                                  const MachineRegisterInfo *MRI,
     418             :                                                  const SIRegisterInfo *TRI,
     419             :                                                  unsigned OpNo,
     420             :                                                  bool Def) const {
     421     6163882 :   const MachineOperand &Op = MI->getOperand(OpNo);
     422     5060527 :   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
     423     1103897 :       (Def && !Op.isDef()))
     424     3881632 :     return {-1, -1};
     425             : 
     426             :   // A use via a PW operand does not need a waitcnt.
     427             :   // A partial write is not a WAW.
     428             :   assert(!Op.getSubReg() || !Op.isUndef());
     429             : 
     430     1141125 :   RegInterval Result;
     431     1141125 :   const MachineRegisterInfo &MRIA = *MRI;
     432             : 
     433     2282250 :   unsigned Reg = TRI->getEncodingValue(Op.getReg());
     434             : 
     435     1141125 :   if (TRI->isVGPR(MRIA, Op.getReg())) {
     436             :     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
     437      493884 :     Result.first = Reg - RegisterEncoding.VGPR0;
     438             :     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
     439      647241 :   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
     440             :     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
     441      647241 :     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
     442             :     assert(Result.first >= NUM_ALL_VGPRS &&
     443             :            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
     444             :   }
     445             :   // TODO: Handle TTMP
     446             :   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
     447             :   else
     448           0 :     return {-1, -1};
     449             : 
     450     1141125 :   const MachineInstr &MIA = *MI;
     451     1141125 :   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
     452     2282250 :   unsigned Size = TRI->getRegSizeInBits(*RC);
     453     1141125 :   Result.second = Result.first + (Size / 32);
     454             : 
     455     1141125 :   return Result;
     456             : }
     457             : 
     458       24786 : void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
     459             :                                        const SIInstrInfo *TII,
     460             :                                        const SIRegisterInfo *TRI,
     461             :                                        const MachineRegisterInfo *MRI,
     462             :                                        unsigned OpNo, int32_t Val) {
     463       24786 :   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
     464             :   DEBUG({
     465             :     const MachineOperand &Opnd = MI->getOperand(OpNo);
     466             :     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
     467             :   });
     468       66512 :   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     469       41726 :     setRegScore(RegNo, EXP_CNT, Val);
     470             :   }
     471       24786 : }
     472             : 
     473      114808 : void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     474             :                                          const SIRegisterInfo *TRI,
     475             :                                          const MachineRegisterInfo *MRI,
     476             :                                          WaitEventType E, MachineInstr &Inst) {
     477      114808 :   const MachineRegisterInfo &MRIA = *MRI;
     478      229616 :   InstCounterType T = eventCounter(E);
     479      114808 :   int32_t CurrScore = getScoreUB(T) + 1;
     480             :   // EventUB and ScoreUB need to be update regardless if this event changes
     481             :   // the score of a register or not.
     482             :   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
     483      114808 :   EventUBs[E] = CurrScore;
     484      114808 :   setScoreUB(T, CurrScore);
     485             : 
     486      114808 :   if (T == EXP_CNT) {
     487             :     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
     488             :     // is required.
     489       23568 :     if (!MixedExpTypes) {
     490       23568 :       MixedExpTypes = counterOutOfOrder(EXP_CNT);
     491             :     }
     492             : 
     493             :     // Put score on the source vgprs. If this is a store, just use those
     494             :     // specific register(s).
     495       23568 :     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
     496             :       // All GDS operations must protect their address register (same as
     497             :       // export.)
     498           0 :       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
     499           0 :           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
     500           0 :         setExpScore(
     501             :             &Inst, TII, TRI, MRI,
     502           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
     503             :             CurrScore);
     504             :       }
     505           0 :       if (Inst.mayStore()) {
     506           0 :         setExpScore(
     507             :             &Inst, TII, TRI, MRI,
     508           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
     509             :             CurrScore);
     510           0 :         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     511             :                                        AMDGPU::OpName::data1) != -1) {
     512           0 :           setExpScore(&Inst, TII, TRI, MRI,
     513           0 :                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     514             :                                                  AMDGPU::OpName::data1),
     515             :                       CurrScore);
     516             :         }
     517           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
     518           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
     519           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
     520           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
     521           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
     522           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
     523           0 :                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
     524           0 :                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
     525           0 :                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
     526           0 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     527           0 :           const MachineOperand &Op = Inst.getOperand(I);
     528           0 :           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
     529           0 :             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     530             :           }
     531             :         }
     532             :       }
     533       23568 :     } else if (TII->isFLAT(Inst)) {
     534           0 :       if (Inst.mayStore()) {
     535           0 :         setExpScore(
     536             :             &Inst, TII, TRI, MRI,
     537           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     538             :             CurrScore);
     539           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     540           0 :         setExpScore(
     541             :             &Inst, TII, TRI, MRI,
     542           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     543             :             CurrScore);
     544             :       }
     545       23568 :     } else if (TII->isMIMG(Inst)) {
     546          56 :       if (Inst.mayStore()) {
     547          56 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     548           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     549           0 :         setExpScore(
     550             :             &Inst, TII, TRI, MRI,
     551           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     552             :             CurrScore);
     553             :       }
     554       23512 :     } else if (TII->isMTBUF(Inst)) {
     555          46 :       if (Inst.mayStore()) {
     556          46 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     557             :       }
     558       23466 :     } else if (TII->isMUBUF(Inst)) {
     559       23060 :       if (Inst.mayStore()) {
     560       23060 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     561           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     562           0 :         setExpScore(
     563             :             &Inst, TII, TRI, MRI,
     564           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     565             :             CurrScore);
     566             :       }
     567             :     } else {
     568         406 :       if (TII->isEXP(Inst)) {
     569             :         // For export the destination registers are really temps that
     570             :         // can be used as the actual source after export patching, so
     571             :         // we need to treat them like sources and set the EXP_CNT
     572             :         // score.
     573        4060 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     574        7308 :           MachineOperand &DefMO = Inst.getOperand(I);
     575        5684 :           if (DefMO.isReg() && DefMO.isDef() &&
     576           0 :               TRI->isVGPR(MRIA, DefMO.getReg())) {
     577           0 :             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
     578             :                         CurrScore);
     579             :           }
     580             :         }
     581             :       }
     582        4060 :       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     583        7308 :         MachineOperand &MO = Inst.getOperand(I);
     584        5684 :         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
     585        1624 :           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     586             :         }
     587             :       }
     588             :     }
     589             : #if 0 // TODO: check if this is handled by MUBUF code above.
     590             :   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
     591             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
     592             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
     593             :     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
     594             :     unsigned OpNo;//TODO: find the OpNo for this operand;
     595             :     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
     596             :     for (signed RegNo = Interval.first; RegNo < Interval.second;
     597             :     ++RegNo) {
     598             :       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
     599             :     }
     600             : #endif
     601             :   } else {
     602             :     // Match the score to the destination registers.
     603      689191 :     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     604      597951 :       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
     605      597951 :       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
     606           0 :         continue;
     607      811789 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     608      106919 :         setRegScore(RegNo, T, CurrScore);
     609             :       }
     610             :     }
     611       91240 :     if (TII->isDS(Inst) && Inst.mayStore()) {
     612             :       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
     613             :     }
     614             :   }
     615      114808 : }
     616             : 
     617             : void BlockWaitcntBrackets::print(raw_ostream &OS) {
     618             :   OS << '\n';
     619             :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     620             :        T = (enum InstCounterType)(T + 1)) {
     621             :     int LB = getScoreLB(T);
     622             :     int UB = getScoreUB(T);
     623             : 
     624             :     switch (T) {
     625             :     case VM_CNT:
     626             :       OS << "    VM_CNT(" << UB - LB << "): ";
     627             :       break;
     628             :     case LGKM_CNT:
     629             :       OS << "    LGKM_CNT(" << UB - LB << "): ";
     630             :       break;
     631             :     case EXP_CNT:
     632             :       OS << "    EXP_CNT(" << UB - LB << "): ";
     633             :       break;
     634             :     default:
     635             :       OS << "    UNKNOWN(" << UB - LB << "): ";
     636             :       break;
     637             :     }
     638             : 
     639             :     if (LB < UB) {
     640             :       // Print vgpr scores.
     641             :       for (int J = 0; J <= getMaxVGPR(); J++) {
     642             :         int RegScore = getRegScore(J, T);
     643             :         if (RegScore <= LB)
     644             :           continue;
     645             :         int RelScore = RegScore - LB - 1;
     646             :         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
     647             :           OS << RelScore << ":v" << J << " ";
     648             :         } else {
     649             :           OS << RelScore << ":ds ";
     650             :         }
     651             :       }
     652             :       // Also need to print sgpr scores for lgkm_cnt.
     653             :       if (T == LGKM_CNT) {
     654             :         for (int J = 0; J <= getMaxSGPR(); J++) {
     655             :           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
     656             :           if (RegScore <= LB)
     657             :             continue;
     658             :           int RelScore = RegScore - LB - 1;
     659             :           OS << RelScore << ":s" << J << " ";
     660             :         }
     661             :       }
     662             :     }
     663             :     OS << '\n';
     664             :   }
     665             :   OS << '\n';
     666             : }
     667             : 
     668     2444121 : unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
     669             :                                                 int ScoreToWait) {
     670     2444121 :   unsigned int NeedWait = 0;
     671     2444121 :   if (ScoreToWait == -1) {
     672             :     // The score to wait is unknown. This implies that it was not encountered
     673             :     // during the path of the CFG walk done during the current traversal but
     674             :     // may be seen on a different path. Emit an s_wait counter with a
     675             :     // conservative value of 0 for the counter.
     676           0 :     NeedWait = CNT_MASK(T);
     677           0 :     setScoreLB(T, getScoreUB(T));
     678             :     return NeedWait;
     679             :   }
     680             : 
     681             :   // If the score of src_operand falls within the bracket, we need an
     682             :   // s_waitcnt instruction.
     683     2444121 :   const int32_t LB = getScoreLB(T);
     684     2444121 :   const int32_t UB = getScoreUB(T);
     685     2444121 :   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     686       40345 :     if (T == VM_CNT && hasPendingFlat()) {
     687             :       // If there is a pending FLAT operation, and this is a VM waitcnt,
     688             :       // then we need to force a waitcnt 0 for VM.
     689         434 :       NeedWait = CNT_MASK(T);
     690         434 :       setScoreLB(T, getScoreUB(T));
     691       39911 :     } else if (counterOutOfOrder(T)) {
     692             :       // Counter can get decremented out-of-order when there
     693             :       // are multiple types event in the brack. Also emit an s_wait counter
     694             :       // with a conservative value of 0 for the counter.
     695       15008 :       NeedWait = CNT_MASK(T);
     696       15008 :       setScoreLB(T, getScoreUB(T));
     697             :     } else {
     698       24903 :       NeedWait = CNT_MASK(T);
     699             :       setScoreLB(T, ScoreToWait);
     700             :     }
     701             :   }
     702             : 
     703             :   return NeedWait;
     704             : }
     705             : 
     706             : // Where there are multiple types of event in the bracket of a counter,
     707             : // the decrement may go out of order.
     708       63479 : bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
     709       63479 :   switch (T) {
     710             :   case VM_CNT:
     711             :     return false;
     712       19210 :   case LGKM_CNT: {
     713       34218 :     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     714       15008 :         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     715             :       // Scalar memory read always can go out of order.
     716             :       return true;
     717             :     }
     718        4202 :     int NumEventTypes = 0;
     719        8404 :     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     720        4202 :         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     721        4202 :       NumEventTypes++;
     722             :     }
     723        4202 :     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     724           0 :         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     725           0 :       NumEventTypes++;
     726             :     }
     727        4202 :     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
     728           0 :         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
     729           0 :       NumEventTypes++;
     730             :     }
     731        4202 :     if (NumEventTypes <= 1) {
     732             :       return false;
     733             :     }
     734             :     break;
     735             :   }
     736       29773 :   case EXP_CNT: {
     737             :     // If there has been a mixture of export types, then a waitcnt exp(0) is
     738             :     // required.
     739       29773 :     if (MixedExpTypes)
     740             :       return true;
     741       29773 :     int NumEventTypes = 0;
     742       30026 :     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     743         253 :         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     744         253 :       NumEventTypes++;
     745             :     }
     746       29773 :     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     747           0 :         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     748           0 :       NumEventTypes++;
     749             :     }
     750       59134 :     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     751       29361 :         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     752       29361 :       NumEventTypes++;
     753             :     }
     754       29903 :     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
     755         130 :         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
     756         130 :       NumEventTypes++;
     757             :     }
     758             : 
     759       29810 :     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
     760          37 :         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
     761          37 :       NumEventTypes++;
     762             :     }
     763             : 
     764       29773 :     if (NumEventTypes <= 1) {
     765             :       return false;
     766             :     }
     767             :     break;
     768             :   }
     769             :   default:
     770             :     break;
     771             :   }
     772           8 :   return true;
     773             : }
     774             : 
     775       53042 : INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     776             :                       false)
     777      312538 : INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     778             :                     false)
     779             : 
     780             : char SIInsertWaitcnts::ID = 0;
     781             : 
     782             : char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
     783             : 
     784        1468 : FunctionPass *llvm::createSIInsertWaitcntsPass() {
     785        1468 :   return new SIInsertWaitcnts();
     786             : }
     787             : 
     788             : static bool readsVCCZ(const MachineInstr &MI) {
     789     1108110 :   unsigned Opc = MI.getOpcode();
     790      554487 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     791         864 :          !MI.getOperand(1).isUndef();
     792             : }
     793             : 
     794             : ///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
     795             : ///  Instructions of a given type are returned in order,
     796             : ///  but instructions of different types can complete out of order.
     797             : ///  We rely on this in-order completion
     798             : ///  and simply assign a score to the memory access instructions.
     799             : ///  We keep track of the active "score bracket" to determine
     800             : ///  if an access of a memory read requires an s_waitcnt
     801             : ///  and if so what the value of each counter is.
     802             : ///  The "score bracket" is bound by the lower bound and upper bound
     803             : ///  scores (*_score_LB and *_score_ub respectively).
     804      277031 : MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
     805             :     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
     806             :   // To emit, or not to emit - that's the question!
     807             :   // Start with an assumption that there is no need to emit.
     808      277031 :   unsigned int EmitSwaitcnt = 0;
     809             :   // s_waitcnt instruction to return; default is NULL.
     810      277031 :   MachineInstr *SWaitInst = nullptr;
     811             :   // No need to wait before phi. If a phi-move exists, then the wait should
     812             :   // has been inserted before the move. If a phi-move does not exist, then
     813             :   // wait should be inserted before the real use. The same is true for
     814             :   // sc-merge. It is not a coincident that all these cases correspond to the
     815             :   // instructions that are skipped in the assembling loop.
     816      277031 :   bool NeedLineMapping = false; // TODO: Check on this.
     817      277031 :   if (MI.isDebugValue() &&
     818             :       // TODO: any other opcode?
     819             :       !NeedLineMapping) {
     820             :     return SWaitInst;
     821             :   }
     822             : 
     823             :   // See if an s_waitcnt is forced at block entry, or is needed at
     824             :   // program end.
     825      277024 :   if (ScoreBrackets->getWaitAtBeginning()) {
     826             :     // Note that we have already cleared the state, so we don't need to update
     827             :     // it.
     828           0 :     ScoreBrackets->clearWaitAtBeginning();
     829           0 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     830           0 :          T = (enum InstCounterType)(T + 1)) {
     831           0 :       EmitSwaitcnt |= CNT_MASK(T);
     832           0 :       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     833             :     }
     834             :   }
     835             : 
     836             :   // See if this instruction has a forced S_WAITCNT VM.
     837             :   // TODO: Handle other cases of NeedsWaitcntVmBefore()
     838      554046 :   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
     839      554045 :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
     840      277021 :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
     841           5 :     EmitSwaitcnt |=
     842           5 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     843             :   }
     844             : 
     845             :   // All waits must be resolved at call return.
     846             :   // NOTE: this could be improved with knowledge of all call sites or
     847             :   //   with knowledge of the called routines.
     848      554048 :   if (MI.getOpcode() == AMDGPU::RETURN ||
     849      553785 :       MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
     850      276761 :       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     851        6202 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     852        2658 :          T = (enum InstCounterType)(T + 1)) {
     853        5316 :       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
     854        1874 :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     855         937 :         EmitSwaitcnt |= CNT_MASK(T);
     856             :       }
     857             :     }
     858             :   }
     859             :   // Resolve vm waits before gs-done.
     860      552264 :   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
     861      552288 :             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
     862          24 :            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
     863             :             AMDGPU::SendMsg::ID_GS_DONE)) {
     864          12 :     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
     865           0 :       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     866           0 :       EmitSwaitcnt |= CNT_MASK(VM_CNT);
     867             :     }
     868             :   }
     869             : #if 0 // TODO: the following blocks of logic when we have fence.
     870             :   else if (MI.getOpcode() == SC_FENCE) {
     871             :     const unsigned int group_size =
     872             :       context->shader_info->GetMaxThreadGroupSize();
     873             :     // group_size == 0 means thread group size is unknown at compile time
     874             :     const bool group_is_multi_wave =
     875             :       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
     876             :     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
     877             : 
     878             :     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
     879             :       SCRegType src_type = Inst->GetSrcType(i);
     880             :       switch (src_type) {
     881             :         case SCMEM_LDS:
     882             :           if (group_is_multi_wave ||
     883             :             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
     884             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     885             :                                ScoreBrackets->getScoreUB(LGKM_CNT));
     886             :             // LDS may have to wait for VM_CNT after buffer load to LDS
     887             :             if (target_info->HasBufferLoadToLDS()) {
     888             :               EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     889             :                                  ScoreBrackets->getScoreUB(VM_CNT));
     890             :             }
     891             :           }
     892             :           break;
     893             : 
     894             :         case SCMEM_GDS:
     895             :           if (group_is_multi_wave || fence_is_global) {
     896             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     897             :               ScoreBrackets->getScoreUB(EXP_CNT));
     898             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     899             :               ScoreBrackets->getScoreUB(LGKM_CNT));
     900             :           }
     901             :           break;
     902             : 
     903             :         case SCMEM_UAV:
     904             :         case SCMEM_TFBUF:
     905             :         case SCMEM_RING:
     906             :         case SCMEM_SCATTER:
     907             :           if (group_is_multi_wave || fence_is_global) {
     908             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     909             :               ScoreBrackets->getScoreUB(EXP_CNT));
     910             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     911             :               ScoreBrackets->getScoreUB(VM_CNT));
     912             :           }
     913             :           break;
     914             : 
     915             :         case SCMEM_SCRATCH:
     916             :         default:
     917             :           break;
     918             :       }
     919             :     }
     920             :   }
     921             : #endif
     922             : 
     923             :   // Export & GDS instructions do not read the EXEC mask until after the export
     924             :   // is granted (which can occur well after the instruction is issued).
     925             :   // The shader program must flush all EXP operations on the export-count
     926             :   // before overwriting the EXEC mask.
     927             :   else {
     928      552264 :     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
     929             :       // Export and GDS are tracked individually, either may trigger a waitcnt
     930             :       // for EXEC.
     931        1928 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     932             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
     933        1928 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     934             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
     935        1928 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     936             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
     937        1928 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     938             :           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
     939             :     }
     940             : 
     941             : #if 0 // TODO: the following code to handle CALL.
     942             :     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
     943             :     // However, there is a problem with EXP_CNT, because the call cannot
     944             :     // easily tell if a register is used in the function, and if it did, then
     945             :     // the referring instruction would have to have an S_WAITCNT, which is
     946             :     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
     947             :     // before the call.
     948             :     if (MI.getOpcode() == SC_CALL) {
     949             :       if (ScoreBrackets->getScoreUB(EXP_CNT) >
     950             :         ScoreBrackets->getScoreLB(EXP_CNT)) {
     951             :         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
     952             :         EmitSwaitcnt |= CNT_MASK(EXP_CNT);
     953             :       }
     954             :     }
     955             : #endif
     956             : 
     957             :     // FIXME: Should not be relying on memoperands.
     958             :     // Look at the source operands of every instruction to see if
     959             :     // any of them results from a previous memory operation that affects
     960             :     // its current usage. If so, an s_waitcnt instruction needs to be
     961             :     // emitted.
     962             :     // If the source operand was defined by a load, add the s_waitcnt
     963             :     // instruction.
     964      361030 :     for (const MachineMemOperand *Memop : MI.memoperands()) {
     965       84898 :       unsigned AS = Memop->getAddrSpace();
     966       84898 :       if (AS != AMDGPUASI.LOCAL_ADDRESS)
     967       77813 :         continue;
     968        7085 :       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
     969             :       // VM_CNT is only relevant to vgpr or LDS.
     970       14170 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     971             :           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
     972             :     }
     973             : 
     974     1505734 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
     975     2459204 :       const MachineOperand &Op = MI.getOperand(I);
     976     1229602 :       const MachineRegisterInfo &MRIA = *MRI;
     977             :       RegInterval Interval =
     978     1229602 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
     979     2578768 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     980     1349166 :         if (TRI->isVGPR(MRIA, Op.getReg())) {
     981             :           // VM_CNT is only relevant to vgpr or LDS.
     982      385963 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
     983             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
     984             :         }
     985     1349166 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
     986             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
     987             :       }
     988             :     }
     989             :     // End of for loop that looks at all source operands to decide vm_wait_cnt
     990             :     // and lgk_wait_cnt.
     991             : 
     992             :     // Two cases are handled for destination operands:
     993             :     // 1) If the destination operand was defined by a load, add the s_waitcnt
     994             :     // instruction to guarantee the right WAW order.
     995             :     // 2) If a destination operand that was used by a recent export/store ins,
     996             :     // add s_waitcnt on exp_cnt to guarantee the WAR order.
     997      276132 :     if (MI.mayStore()) {
     998             :       // FIXME: Should not be relying on memoperands.
     999       65820 :       for (const MachineMemOperand *Memop : MI.memoperands()) {
    1000       33200 :         unsigned AS = Memop->getAddrSpace();
    1001       33200 :         if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1002       28995 :           continue;
    1003        4205 :         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1004        8410 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1005             :             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1006        8410 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1007             :             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1008             :       }
    1009             :     }
    1010     1505734 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1011     2459204 :       MachineOperand &Def = MI.getOperand(I);
    1012     1229602 :       const MachineRegisterInfo &MRIA = *MRI;
    1013             :       RegInterval Interval =
    1014     1229602 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
    1015     1573224 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1016      343622 :         if (TRI->isVGPR(MRIA, Def.getReg())) {
    1017      170995 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1018             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1019      170995 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1020             :               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1021             :         }
    1022      343622 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1023             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1024             :       }
    1025             :     } // End of for loop that looks at all dest operands.
    1026             :   }
    1027             : 
    1028             :   // TODO: Tie force zero to a compiler triage option.
    1029      277024 :   bool ForceZero = false;
    1030             : 
    1031             :   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
    1032             :   // occurs before the instruction. Doing it here prevents any additional
    1033             :   // S_WAITCNTs from being emitted if the instruction was marked as
    1034             :   // requiring a WAITCNT beforehand.
    1035      554106 :   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
    1036          58 :       !ST->hasAutoWaitcntBeforeBarrier()) {
    1037          56 :     EmitSwaitcnt |=
    1038          56 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1039          56 :     EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1040             :         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1041          56 :     EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1042             :         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
    1043             :   }
    1044             : 
    1045             :   // TODO: Remove this work-around, enable the assert for Bug 457939
    1046             :   //       after fixing the scheduler. Also, the Shader Compiler code is
    1047             :   //       independent of target.
    1048         216 :   if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    1049         155 :     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1050         155 :             ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1051          17 :         ScoreBrackets->hasPendingSMEM()) {
    1052             :       // Wait on everything, not just LGKM.  vccz reads usually come from
    1053             :       // terminators, and we always wait on everything at the end of the
    1054             :       // block, so if we only wait on LGKM here, we might end up with
    1055             :       // another s_waitcnt inserted right after this if there are non-LGKM
    1056             :       // instructions still outstanding.
    1057             :       ForceZero = true;
    1058             :       EmitSwaitcnt = true;
    1059             :     }
    1060             :   }
    1061             : 
    1062             :   // Does this operand processing indicate s_wait counter update?
    1063      277017 :   if (EmitSwaitcnt) {
    1064             :     int CntVal[NUM_INST_CNTS];
    1065             : 
    1066       34288 :     bool UseDefaultWaitcntStrategy = true;
    1067       34288 :     if (ForceZero) {
    1068             :       // Force all waitcnts to 0.
    1069          49 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1070          21 :            T = (enum InstCounterType)(T + 1)) {
    1071          42 :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1072             :       }
    1073           7 :       CntVal[VM_CNT] = 0;
    1074           7 :       CntVal[EXP_CNT] = 0;
    1075           7 :       CntVal[LGKM_CNT] = 0;
    1076           7 :       UseDefaultWaitcntStrategy = false;
    1077             :     }
    1078             : 
    1079             :     if (UseDefaultWaitcntStrategy) {
    1080      239967 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1081      102843 :            T = (enum InstCounterType)(T + 1)) {
    1082      102843 :         if (EmitSwaitcnt & CNT_MASK(T)) {
    1083             :           int Delta =
    1084       74676 :               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
    1085       37338 :           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
    1086       37338 :           if (Delta >= MaxDelta) {
    1087         114 :             Delta = -1;
    1088         114 :             if (T != EXP_CNT) {
    1089         114 :               ScoreBrackets->setScoreLB(
    1090         114 :                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
    1091             :             }
    1092         114 :             EmitSwaitcnt &= ~CNT_MASK(T);
    1093             :           }
    1094       37338 :           CntVal[T] = Delta;
    1095             :         } else {
    1096             :           // If we are not waiting for a particular counter then encode
    1097             :           // it as -1 which means "don't care."
    1098       65505 :           CntVal[T] = -1;
    1099             :         }
    1100             :       }
    1101             :     }
    1102             : 
    1103             :     // If we are not waiting on any counter we can skip the wait altogether.
    1104       34288 :     if (EmitSwaitcnt != 0) {
    1105       34178 :       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
    1106       34178 :       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
    1107         466 :       if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
    1108         466 :                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
    1109         233 :           (AMDGPU::decodeExpcnt(IV, Imm) !=
    1110       34644 :            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
    1111         233 :           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
    1112         233 :            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
    1113       67892 :         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
    1114         219 :         if (ContainingLoop) {
    1115         438 :           MachineBasicBlock *TBB = ContainingLoop->getHeader();
    1116             :           BlockWaitcntBrackets *ScoreBracket =
    1117         657 :               BlockWaitcntBracketsMap[TBB].get();
    1118         219 :           if (!ScoreBracket) {
    1119             :             assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
    1120           0 :             BlockWaitcntBracketsMap[TBB] =
    1121           0 :                 llvm::make_unique<BlockWaitcntBrackets>();
    1122           0 :             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    1123             :           }
    1124         219 :           ScoreBracket->setRevisitLoop(true);
    1125             :           DEBUG(dbgs() << "set-revisit: block"
    1126             :                        << ContainingLoop->getHeader()->getNumber() << '\n';);
    1127             :         }
    1128             :       }
    1129             : 
    1130             :       // Update an existing waitcount, or make a new one.
    1131       34178 :       MachineFunction &MF = *MI.getParent()->getParent();
    1132       34411 :       if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
    1133           0 :         SWaitInst = OldWaitcnt;
    1134             :       } else {
    1135       68356 :         SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
    1136             :                                           MI.getDebugLoc());
    1137       68356 :         CompilerGeneratedWaitcntSet.insert(SWaitInst);
    1138             :       }
    1139             : 
    1140             :       const MachineOperand &Op =
    1141      102534 :           MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
    1142      170890 :               IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
    1143       34178 :       SWaitInst->addOperand(MF, Op);
    1144             : 
    1145       34178 :       if (CntVal[EXP_CNT] == 0) {
    1146             :         ScoreBrackets->setMixedExpTypes(false);
    1147             :       }
    1148             :     }
    1149             :   }
    1150             : 
    1151      277024 :   return SWaitInst;
    1152             : }
    1153             : 
    1154           0 : void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
    1155             :                                              MachineInstr *Waitcnt) {
    1156           0 :   if (MBB.empty()) {
    1157           0 :     MBB.push_back(Waitcnt);
    1158           0 :     return;
    1159             :   }
    1160             : 
    1161           0 :   MachineBasicBlock::iterator It = MBB.end();
    1162           0 :   MachineInstr *MI = &*(--It);
    1163           0 :   if (MI->isBranch()) {
    1164             :     MBB.insert(It, Waitcnt);
    1165             :   } else {
    1166             :     MBB.push_back(Waitcnt);
    1167             :   }
    1168             : }
    1169             : 
    1170             : // This is a flat memory operation. Check to see if it has memory
    1171             : // tokens for both LDS and Memory, and if so mark it as a flat.
    1172        7967 : bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
    1173        7967 :   if (MI.memoperands_empty())
    1174             :     return true;
    1175             : 
    1176       14570 :   for (const MachineMemOperand *Memop : MI.memoperands()) {
    1177        7956 :     unsigned AS = Memop->getAddrSpace();
    1178        7956 :     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
    1179             :       return true;
    1180             :   }
    1181             : 
    1182             :   return false;
    1183             : }
    1184             : 
    1185      277031 : void SIInsertWaitcnts::updateEventWaitCntAfter(
    1186             :     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
    1187             :   // Now look at the instruction opcode. If it is a memory access
    1188             :   // instruction, update the upper-bound of the appropriate counter's
    1189             :   // bracket and the destination operand scores.
    1190             :   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
    1191      282178 :   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    1192        5147 :     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
    1193           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
    1194           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    1195             :     } else {
    1196        5147 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1197             :     }
    1198      271884 :   } else if (TII->isFLAT(Inst)) {
    1199             :     assert(Inst.mayLoad() || Inst.mayStore());
    1200             : 
    1201        9516 :     if (TII->usesVM_CNT(Inst))
    1202        9516 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    1203             : 
    1204        9516 :     if (TII->usesLGKM_CNT(Inst)) {
    1205        7967 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1206             : 
    1207             :       // This is a flat memory operation, so note it - it will require
    1208             :       // that both the VM and LGKM be flushed to zero if it is pending when
    1209             :       // a VM or LGKM dependency occurs.
    1210        7967 :       if (mayAccessLDSThroughFlat(Inst))
    1211             :         ScoreBrackets->setPendingFlat();
    1212             :     }
    1213       39191 :   } else if (SIInstrInfo::isVMEM(Inst) &&
    1214             :              // TODO: get a better carve out.
    1215       78380 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
    1216       78377 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
    1217       39188 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    1218       39186 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    1219             :     if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
    1220       55210 :         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
    1221       23162 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    1222             :     }
    1223      223182 :   } else if (TII->isSMRD(Inst)) {
    1224       29400 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1225             :   } else {
    1226      387564 :     switch (Inst.getOpcode()) {
    1227          24 :     case AMDGPU::S_SENDMSG:
    1228             :     case AMDGPU::S_SENDMSGHALT:
    1229          24 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
    1230          24 :       break;
    1231         406 :     case AMDGPU::EXP:
    1232             :     case AMDGPU::EXP_DONE: {
    1233         406 :       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    1234         406 :       if (Imm >= 32 && Imm <= 63)
    1235         120 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    1236         286 :       else if (Imm >= 12 && Imm <= 15)
    1237          33 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    1238             :       else
    1239         253 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
    1240             :       break;
    1241             :     }
    1242           0 :     case AMDGPU::S_MEMTIME:
    1243             :     case AMDGPU::S_MEMREALTIME:
    1244           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1245           0 :       break;
    1246             :     default:
    1247             :       break;
    1248             :     }
    1249             :   }
    1250      277031 : }
    1251             : 
    1252       17021 : void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
    1253       51063 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1254       17021 :   int32_t MaxPending[NUM_INST_CNTS] = {0};
    1255       17021 :   int32_t MaxFlat[NUM_INST_CNTS] = {0};
    1256       17021 :   bool MixedExpTypes = false;
    1257             : 
    1258             :   // Clear the score bracket state.
    1259       17021 :   ScoreBrackets->clear();
    1260             : 
    1261             :   // Compute the number of pending elements on block entry.
    1262             : 
    1263             :   // IMPORTANT NOTE: If iterative handling of loops is added, the code will
    1264             :   // need to handle single BBs with backedges to themselves. This means that
    1265             :   // they will need to retain and not clear their initial state.
    1266             : 
    1267             :   // See if there are any uninitialized predecessors. If so, emit an
    1268             :   // s_waitcnt 0 at the beginning of the block.
    1269       37301 :   for (MachineBasicBlock *pred : Block.predecessors()) {
    1270             :     BlockWaitcntBrackets *PredScoreBrackets =
    1271        9930 :         BlockWaitcntBracketsMap[pred].get();
    1272       16550 :     bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
    1273        3310 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1274             :       break;
    1275             :     }
    1276       22813 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1277        9777 :          T = (enum InstCounterType)(T + 1)) {
    1278             :       int span =
    1279       19554 :           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
    1280       19554 :       MaxPending[T] = std::max(MaxPending[T], span);
    1281        9777 :       span =
    1282        9777 :           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
    1283       19554 :       MaxFlat[T] = std::max(MaxFlat[T], span);
    1284             :     }
    1285             : 
    1286        3259 :     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
    1287             :   }
    1288             : 
    1289             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1290             :   // Also handle kills for exit block.
    1291       31888 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1292        6051 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1293       13433 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1294        5757 :            T = (enum InstCounterType)(T + 1)) {
    1295       28785 :         int Span = KillWaitBrackets[I]->getScoreUB(T) -
    1296       23028 :                    KillWaitBrackets[I]->getScoreLB(T);
    1297       11514 :         MaxPending[T] = std::max(MaxPending[T], Span);
    1298       23028 :         Span = KillWaitBrackets[I]->pendingFlat(T) -
    1299       11514 :                KillWaitBrackets[I]->getScoreLB(T);
    1300       11514 :         MaxFlat[T] = std::max(MaxFlat[T], Span);
    1301             :       }
    1302             : 
    1303        5757 :       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    1304             :     }
    1305             :   }
    1306             : 
    1307             :   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
    1308       37301 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1309             :     BlockWaitcntBrackets *PredScoreBrackets =
    1310        9930 :         BlockWaitcntBracketsMap[Pred].get();
    1311       16550 :     bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
    1312        3310 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1313             :       break;
    1314             :     }
    1315             : 
    1316        6518 :     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
    1317        6518 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1318        3259 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1319        6518 :     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
    1320        6518 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1321        3259 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1322             :   }
    1323             : 
    1324             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1325       31888 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1326        6051 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1327        9595 :       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
    1328        7676 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1329        1919 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1330        7676 :       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
    1331        5757 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1332        1919 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1333             :     }
    1334             :   }
    1335             : 
    1336             : #if 0
    1337             :   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
    1338             :   // TODO: how does LC distinguish between function entry and main entry?
    1339             :   // If this is the entry to a function, force a wait.
    1340             :   MachineBasicBlock &Entry = Block.getParent()->front();
    1341             :   if (Entry.getNumber() == Block.getNumber()) {
    1342             :     ScoreBrackets->setWaitAtBeginning();
    1343             :     return;
    1344             :   }
    1345             : #endif
    1346             : 
    1347             :   // Now set the current Block's brackets to the largest ending bracket.
    1348      119147 :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1349       51063 :        T = (enum InstCounterType)(T + 1)) {
    1350      102126 :     ScoreBrackets->setScoreUB(T, MaxPending[T]);
    1351       51063 :     ScoreBrackets->setScoreLB(T, 0);
    1352      102126 :     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
    1353             :   }
    1354             : 
    1355       34042 :   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
    1356             : 
    1357             :   // Set the register scoreboard.
    1358       37301 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1359       16550 :     if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
    1360             :       break;
    1361             :     }
    1362             : 
    1363             :     BlockWaitcntBrackets *PredScoreBrackets =
    1364        9777 :         BlockWaitcntBracketsMap[Pred].get();
    1365             : 
    1366             :     // Now merge the gpr_reg_score information
    1367       13036 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1368        9777 :          T = (enum InstCounterType)(T + 1)) {
    1369        9777 :       int PredLB = PredScoreBrackets->getScoreLB(T);
    1370        9777 :       int PredUB = PredScoreBrackets->getScoreUB(T);
    1371        9777 :       if (PredLB < PredUB) {
    1372        1976 :         int PredScale = MaxPending[T] - PredUB;
    1373             :         // Merge vgpr scores.
    1374       72215 :         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
    1375       70239 :           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
    1376       70239 :           if (PredRegScore <= PredLB)
    1377       62821 :             continue;
    1378        7418 :           int NewRegScore = PredScale + PredRegScore;
    1379       14836 :           ScoreBrackets->setRegScore(
    1380       22254 :               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1381             :         }
    1382             :         // Also need to merge sgpr scores for lgkm_cnt.
    1383        1976 :         if (T == LGKM_CNT) {
    1384       11971 :           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
    1385             :             int PredRegScore =
    1386       11180 :                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1387        5590 :             if (PredRegScore <= PredLB)
    1388        3389 :               continue;
    1389        2201 :             int NewRegScore = PredScale + PredRegScore;
    1390        4402 :             ScoreBrackets->setRegScore(
    1391             :                 J + NUM_ALL_VGPRS, LGKM_CNT,
    1392             :                 std::max(
    1393        6603 :                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1394        2201 :                     NewRegScore));
    1395             :           }
    1396             :         }
    1397             :       }
    1398             :     }
    1399             : 
    1400             :     // Also merge the WaitEvent information.
    1401       68439 :     ForAllWaitEventType(W) {
    1402       65180 :       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
    1403       32590 :       int PredEventUB = PredScoreBrackets->getEventUB(W);
    1404       32590 :       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
    1405             :         int NewEventUB =
    1406        3830 :             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
    1407        1915 :         if (NewEventUB > 0) {
    1408        3830 :           ScoreBrackets->setEventUB(
    1409        5745 :               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1410             :         }
    1411             :       }
    1412             :     }
    1413             :   }
    1414             : 
    1415             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1416             :   // Set the register scoreboard.
    1417       31888 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1418        6051 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1419             :       // Now merge the gpr_reg_score information.
    1420       13433 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1421        5757 :            T = (enum InstCounterType)(T + 1)) {
    1422       23028 :         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
    1423       17271 :         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
    1424        5757 :         if (PredLB < PredUB) {
    1425        1188 :           int PredScale = MaxPending[T] - PredUB;
    1426             :           // Merge vgpr scores.
    1427       17739 :           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
    1428       14175 :             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
    1429        4725 :             if (PredRegScore <= PredLB)
    1430        4254 :               continue;
    1431         471 :             int NewRegScore = PredScale + PredRegScore;
    1432         942 :             ScoreBrackets->setRegScore(
    1433        1413 :                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1434             :           }
    1435             :           // Also need to merge sgpr scores for lgkm_cnt.
    1436        1188 :           if (T == LGKM_CNT) {
    1437       44533 :             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
    1438             :               int PredRegScore =
    1439       31584 :                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1440       10528 :               if (PredRegScore <= PredLB)
    1441        8760 :                 continue;
    1442        1768 :               int NewRegScore = PredScale + PredRegScore;
    1443        3536 :               ScoreBrackets->setRegScore(
    1444             :                   J + NUM_ALL_VGPRS, LGKM_CNT,
    1445             :                   std::max(
    1446        5304 :                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1447        1768 :                       NewRegScore));
    1448             :             }
    1449             :           }
    1450             :         }
    1451             :       }
    1452             : 
    1453             :       // Also merge the WaitEvent information.
    1454       40299 :       ForAllWaitEventType(W) {
    1455       76760 :         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
    1456       57570 :         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
    1457       57570 :         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
    1458             :           int NewEventUB =
    1459        3600 :               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
    1460        1200 :           if (NewEventUB > 0) {
    1461        2400 :             ScoreBrackets->setEventUB(
    1462        3600 :                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1463             :           }
    1464             :         }
    1465             :       }
    1466             :     }
    1467             :   }
    1468             : 
    1469             :   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
    1470             :   // sequencing predecessors, because changes to EXEC require waitcnts due to
    1471             :   // the delayed nature of these operations.
    1472       37301 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1473       16550 :     if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
    1474             :       break;
    1475             :     }
    1476             : 
    1477             :     BlockWaitcntBrackets *PredScoreBrackets =
    1478        9777 :         BlockWaitcntBracketsMap[Pred].get();
    1479             : 
    1480        3259 :     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    1481        3259 :     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1482           0 :       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
    1483           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1484           0 :       if (new_gds_ub > 0) {
    1485           0 :         ScoreBrackets->setEventUB(
    1486             :             GDS_GPR_LOCK,
    1487           0 :             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
    1488             :       }
    1489             :     }
    1490        3259 :     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    1491        3259 :     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1492           0 :       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
    1493           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1494           0 :       if (new_exp_ub > 0) {
    1495           0 :         ScoreBrackets->setEventUB(
    1496             :             EXP_GPR_LOCK,
    1497           0 :             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
    1498             :       }
    1499             :     }
    1500             :   }
    1501       17021 : }
    1502             : 
    1503             : /// Return the "bottom" block of a loop. This differs from
    1504             : /// MachineLoop::getBottomBlock in that it works even if the loop is
    1505             : /// discontiguous.
    1506             : MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
    1507        2288 :   MachineBasicBlock *Bottom = Loop->getHeader();
    1508        6444 :   for (MachineBasicBlock *MBB : Loop->blocks())
    1509        3012 :     if (MBB->getNumber() > Bottom->getNumber())
    1510        1572 :       Bottom = MBB;
    1511             :   return Bottom;
    1512             : }
    1513             : 
    1514             : // Generate s_waitcnt instructions where needed.
    1515       17021 : void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
    1516             :                                             MachineBasicBlock &Block) {
    1517             :   // Initialize the state information.
    1518       17021 :   mergeInputScoreBrackets(Block);
    1519             : 
    1520       51063 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1521             : 
    1522             :   DEBUG({
    1523             :     dbgs() << "Block" << Block.getNumber();
    1524             :     ScoreBrackets->dump();
    1525             :   });
    1526             : 
    1527       17021 :   bool InsertNOP = false;
    1528             : 
    1529             :   // Walk over the instructions.
    1530       34042 :   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
    1531      285998 :        Iter != E;) {
    1532      277274 :     MachineInstr &Inst = *Iter;
    1533             :     // Remove any previously existing waitcnts.
    1534      277517 :     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
    1535             :       // TODO: Register the old waitcnt and optimize the following waitcnts.
    1536             :       // Leaving the previously existing waitcnts is conservatively correct.
    1537         729 :       if (CompilerGeneratedWaitcntSet.find(&Inst) ==
    1538         486 :           CompilerGeneratedWaitcntSet.end())
    1539             :         ++Iter;
    1540             :       else {
    1541         466 :         ScoreBrackets->setWaitcnt(&Inst);
    1542         233 :         ++Iter;
    1543         233 :         Inst.removeFromParent();
    1544             :       }
    1545         243 :       continue;
    1546             :     }
    1547             : 
    1548             :     // Kill instructions generate a conditional branch to the endmain block.
    1549             :     // Merge the current waitcnt state into the endmain block information.
    1550             :     // TODO: Are there other flavors of KILL instruction?
    1551      277031 :     if (Inst.getOpcode() == AMDGPU::KILL) {
    1552         270 :       addKillWaitBracket(ScoreBrackets);
    1553             :     }
    1554             : 
    1555      277031 :     bool VCCZBugWorkAround = false;
    1556         432 :     if (readsVCCZ(Inst) &&
    1557        1080 :         (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
    1558         216 :       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1559         216 :               ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1560             :           ScoreBrackets->hasPendingSMEM()) {
    1561           8 :         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
    1562           7 :           VCCZBugWorkAround = true;
    1563             :       }
    1564             :     }
    1565             : 
    1566             :     // Generate an s_waitcnt instruction to be placed before
    1567             :     // cur_Inst, if needed.
    1568      277031 :     MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
    1569             : 
    1570      277031 :     if (SWaitInst) {
    1571       68356 :       Block.insert(Inst, SWaitInst);
    1572             :       if (ScoreBrackets->getWaitcnt() != SWaitInst) {
    1573             :         DEBUG(dbgs() << "insertWaitcntInBlock\n"
    1574             :                      << "Old Instr: " << Inst << '\n'
    1575             :                      << "New Instr: " << *SWaitInst << '\n';);
    1576             :       }
    1577             :     }
    1578             : 
    1579      277031 :     updateEventWaitCntAfter(Inst, ScoreBrackets);
    1580             : 
    1581             : #if 0 // TODO: implement resource type check controlled by options with ub = LB.
    1582             :     // If this instruction generates a S_SETVSKIP because it is an
    1583             :     // indexed resource, and we are on Tahiti, then it will also force
    1584             :     // an S_WAITCNT vmcnt(0)
    1585             :     if (RequireCheckResourceType(Inst, context)) {
    1586             :       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
    1587             :       ScoreBrackets->setScoreLB(VM_CNT,
    1588             :       ScoreBrackets->getScoreUB(VM_CNT));
    1589             :     }
    1590             : #endif
    1591             : 
    1592             :     ScoreBrackets->clearWaitcnt();
    1593             : 
    1594             :     if (SWaitInst) {
    1595             :       DEBUG({ SWaitInst->print(dbgs() << '\n'); });
    1596             :     }
    1597             :     DEBUG({
    1598             :       Inst.print(dbgs());
    1599             :       ScoreBrackets->dump();
    1600             :     });
    1601             : 
    1602             :     // Check to see if this is a GWS instruction. If so, and if this is CI or
    1603             :     // VI, then the generated code sequence will include an S_WAITCNT 0.
    1604             :     // TODO: Are these the only GWS instructions?
    1605      277031 :     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
    1606      277031 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
    1607      277031 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
    1608      554062 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
    1609             :         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
    1610             :       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
    1611           0 :       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1612           0 :       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1613           0 :       ScoreBrackets->updateByWait(LGKM_CNT,
    1614             :                                   ScoreBrackets->getScoreUB(LGKM_CNT));
    1615             :     }
    1616             : 
    1617             :     // TODO: Remove this work-around after fixing the scheduler and enable the
    1618             :     // assert above.
    1619      277031 :     if (VCCZBugWorkAround) {
    1620             :       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
    1621             :       // bit is updated, so we can restore the bit by reading the value of
    1622             :       // vcc and then writing it back to the register.
    1623          21 :       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    1624          21 :               AMDGPU::VCC)
    1625           7 :           .addReg(AMDGPU::VCC);
    1626          14 :       VCCZBugHandledSet.insert(&Inst);
    1627             :     }
    1628             : 
    1629      277031 :     if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    1630             : 
    1631             :       // This avoids a s_nop after a waitcnt has just been inserted.
    1632      137170 :       if (!SWaitInst && InsertNOP) {
    1633        1590 :         BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
    1634             :       }
    1635      137170 :       InsertNOP = false;
    1636             : 
    1637             :       // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    1638             :       // or SMEM clause, respectively.
    1639             :       //
    1640             :       // The temporary workaround is to break the clauses with S_NOP.
    1641             :       //
    1642             :       // The proper solution would be to allocate registers such that all source
    1643             :       // and destination registers don't overlap, e.g. this is illegal:
    1644             :       //   r0 = load r2
    1645             :       //   r2 = load r0
    1646      137170 :       bool IsSMEM = false;
    1647      137170 :       bool IsVMEM = false;
    1648      137170 :       if (TII->isSMRD(Inst))
    1649             :         IsSMEM = true;
    1650      122448 :       else if (TII->usesVM_CNT(Inst))
    1651       25048 :         IsVMEM = true;
    1652             : 
    1653      137170 :       ++Iter;
    1654      137170 :       if (Iter == E)
    1655             :         break;
    1656             : 
    1657      128873 :       MachineInstr &Next = *Iter;
    1658             : 
    1659             :       // TODO: How about consecutive SMEM instructions?
    1660             :       //       The comments above says break the clause but the code does not.
    1661             :       // if ((TII->isSMRD(next) && isSMEM) ||
    1662      256259 :       if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
    1663             :           // TODO: Enable this check when hasSoftClause is upstreamed.
    1664             :           // ST->hasSoftClauses() &&
    1665       12914 :           ST->isXNACKEnabled()) {
    1666             :         // Insert a NOP to break the clause.
    1667         311 :         InsertNOP = true;
    1668         311 :         continue;
    1669             :       }
    1670             : 
    1671             :       // There must be "S_NOP 0" between an instruction writing M0 and
    1672             :       // S_SENDMSG.
    1673      128556 :       if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
    1674      128574 :            Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
    1675          12 :           Inst.definesRegister(AMDGPU::M0))
    1676             :         InsertNOP = true;
    1677             : 
    1678      128562 :       continue;
    1679             :     }
    1680             : 
    1681             :     ++Iter;
    1682             :   }
    1683             : 
    1684             :   // Check if we need to force convergence at loop footer.
    1685       34042 :   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
    1686       17593 :   if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
    1687        1062 :     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1688             :     WaitcntData->print();
    1689             :     DEBUG(dbgs() << '\n';);
    1690             : 
    1691             :     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    1692             :     // placement and doesn't always guarantee convergence for a loop. Each
    1693             :     // loop should take at most 2 iterations for it to converge naturally.
    1694             :     // When this max is reached and result doesn't converge, we force
    1695             :     // convergence by inserting a s_waitcnt at the end of loop footer.
    1696         354 :     if (WaitcntData->getIterCnt() > 2) {
    1697             :       // To ensure convergence, need to make wait events at loop footer be no
    1698             :       // more than those from the previous iteration.
    1699             :       // As a simplification, Instead of tracking individual scores and
    1700             :       // generate the precise wait count, just wait on 0.
    1701           0 :       bool HasPending = false;
    1702           0 :       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
    1703           0 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1704           0 :            T = (enum InstCounterType)(T + 1)) {
    1705           0 :         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
    1706           0 :           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1707           0 :           HasPending = true;
    1708             :         }
    1709             :       }
    1710             : 
    1711           0 :       if (HasPending) {
    1712           0 :         if (!SWaitInst) {
    1713           0 :           SWaitInst = Block.getParent()->CreateMachineInstr(
    1714           0 :               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
    1715           0 :           CompilerGeneratedWaitcntSet.insert(SWaitInst);
    1716           0 :           const MachineOperand &Op = MachineOperand::CreateImm(0);
    1717           0 :           SWaitInst->addOperand(MF, Op);
    1718             : #if 0 // TODO: Format the debug output
    1719             :           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
    1720             :           OutputTransformAdd(SWaitInst, context);
    1721             : #endif
    1722             :         }
    1723             : #if 0 // TODO: ??
    1724             :         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
    1725             : #endif
    1726             :       }
    1727             : 
    1728           0 :       if (SWaitInst) {
    1729             :         DEBUG({
    1730             :           SWaitInst->print(dbgs());
    1731             :           dbgs() << "\nAdjusted score board:";
    1732             :           ScoreBrackets->dump();
    1733             :         });
    1734             : 
    1735             :         // Add this waitcnt to the block. It is either newly created or
    1736             :         // created in previous iterations and added back since block traversal
    1737             :         // always remove waitcnt.
    1738           0 :         insertWaitcntBeforeCF(Block, SWaitInst);
    1739           0 :         WaitcntData->setWaitcnt(SWaitInst);
    1740             :       }
    1741             :     }
    1742             :   }
    1743       17021 : }
    1744             : 
    1745       14832 : bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
    1746       14832 :   ST = &MF.getSubtarget<SISubtarget>();
    1747       29664 :   TII = ST->getInstrInfo();
    1748       29664 :   TRI = &TII->getRegisterInfo();
    1749       14832 :   MRI = &MF.getRegInfo();
    1750       14832 :   MLI = &getAnalysis<MachineLoopInfo>();
    1751       29664 :   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
    1752       14832 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1753       29664 :   AMDGPUASI = ST->getAMDGPUAS();
    1754             : 
    1755       14832 :   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
    1756       14832 :   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
    1757       14832 :   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
    1758             : 
    1759       29664 :   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
    1760       29664 :   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
    1761             :   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
    1762             :   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
    1763             : 
    1764       29664 :   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
    1765       14832 :   RegisterEncoding.VGPRL =
    1766       14832 :       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
    1767       29664 :   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
    1768       14832 :   RegisterEncoding.SGPRL =
    1769       14832 :       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
    1770             : 
    1771             :   // Walk over the blocks in reverse post-dominator order, inserting
    1772             :   // s_waitcnt where needed.
    1773       29664 :   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
    1774       14832 :   bool Modified = false;
    1775             :   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1776       14832 :            I = RPOT.begin(),
    1777       14832 :            E = RPOT.end(), J = RPOT.begin();
    1778       31853 :        I != E;) {
    1779       17021 :     MachineBasicBlock &MBB = **I;
    1780             : 
    1781       34042 :     BlockVisitedSet.insert(&MBB);
    1782             : 
    1783       51063 :     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1784       17021 :     if (!ScoreBrackets) {
    1785       22504 :       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
    1786       16878 :       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1787             :     }
    1788       34042 :     ScoreBrackets->setPostOrder(MBB.getNumber());
    1789       34042 :     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    1790       18165 :     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
    1791         332 :       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
    1792             : 
    1793             :     // If we are walking into the block from before the loop, then guarantee
    1794             :     // at least 1 re-walk over the loop to propagate the information, even if
    1795             :     // no S_WAITCNT instructions were generated.
    1796       18125 :     if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
    1797         534 :         (BlockWaitcntProcessedSet.find(&MBB) ==
    1798         356 :          BlockWaitcntProcessedSet.end())) {
    1799         604 :       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
    1800             :       DEBUG(dbgs() << "set-revisit: block"
    1801             :                    << ContainingLoop->getHeader()->getNumber() << '\n';);
    1802             :     }
    1803             : 
    1804             :     // Walk over the instructions.
    1805       17021 :     insertWaitcntInBlock(MF, MBB);
    1806             : 
    1807             :     // Flag that waitcnts have been processed at least once.
    1808       34042 :     BlockWaitcntProcessedSet.insert(&MBB);
    1809             : 
    1810             :     // See if we want to revisit the loop.
    1811       17593 :     if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
    1812         708 :       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
    1813        1062 :       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
    1814         354 :       if (EntrySB && EntrySB->getRevisitLoop()) {
    1815         176 :         EntrySB->setRevisitLoop(false);
    1816         176 :         J = I;
    1817         176 :         int32_t PostOrder = EntrySB->getPostOrder();
    1818             :         // TODO: Avoid this loop. Find another way to set I.
    1819             :         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1820         176 :                  X = RPOT.begin(),
    1821             :                  Y = RPOT.end();
    1822         436 :              X != Y; ++X) {
    1823         436 :           MachineBasicBlock &MBBX = **X;
    1824         436 :           if (MBBX.getNumber() == PostOrder) {
    1825             :             I = X;
    1826             :             break;
    1827             :           }
    1828             :         }
    1829         528 :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1830             :         WaitcntData->incIterCnt();
    1831             :         DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
    1832         176 :         continue;
    1833             :       } else {
    1834         534 :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1835             :         // Loop converged, reset iteration count. If this loop gets revisited,
    1836             :         // it must be from an outer loop, the counter will restart, this will
    1837             :         // ensure we don't force convergence on such revisits.
    1838             :         WaitcntData->resetIterCnt();
    1839             :       }
    1840             :     }
    1841             : 
    1842       16845 :     J = I;
    1843       16845 :     ++I;
    1844             :   }
    1845             : 
    1846       29664 :   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    1847             : 
    1848       14832 :   bool HaveScalarStores = false;
    1849             : 
    1850       29664 :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
    1851             :        ++BI) {
    1852       16722 :     MachineBasicBlock &MBB = *BI;
    1853             : 
    1854      351001 :     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
    1855             :          ++I) {
    1856      902005 :       if (!HaveScalarStores && TII->isScalarStore(*I))
    1857             :         HaveScalarStores = true;
    1858             : 
    1859      888588 :       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
    1860      286918 :           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
    1861       14180 :         EndPgmBlocks.push_back(&MBB);
    1862             :     }
    1863             :   }
    1864             : 
    1865       14832 :   if (HaveScalarStores) {
    1866             :     // If scalar writes are used, the cache must be flushed or else the next
    1867             :     // wave to reuse the same scratch memory can be clobbered.
    1868             :     //
    1869             :     // Insert s_dcache_wb at wave termination points if there were any scalar
    1870             :     // stores, and only if the cache hasn't already been flushed. This could be
    1871             :     // improved by looking across blocks for flushes in postdominating blocks
    1872             :     // from the stores but an explicitly requested flush is probably very rare.
    1873          32 :     for (MachineBasicBlock *MBB : EndPgmBlocks) {
    1874           8 :       bool SeenDCacheWB = false;
    1875             : 
    1876         141 :       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
    1877             :            ++I) {
    1878         234 :         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
    1879             :           SeenDCacheWB = true;
    1880         234 :         else if (TII->isScalarStore(*I))
    1881           1 :           SeenDCacheWB = false;
    1882             : 
    1883             :         // FIXME: It would be better to insert this before a waitcnt if any.
    1884         226 :         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
    1885         234 :              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
    1886             :             !SeenDCacheWB) {
    1887           8 :           Modified = true;
    1888          32 :           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
    1889             :         }
    1890             :       }
    1891             :     }
    1892             :   }
    1893             : 
    1894       14832 :   if (!MFI->isEntryFunction()) {
    1895             :     // Wait for any outstanding memory operations that the input registers may
    1896             :     // depend on. We can't track them and it's better to to the wait after the
    1897             :     // costly call sequence.
    1898             : 
    1899             :     // TODO: Could insert earlier and schedule more liberally with operations
    1900             :     // that only use caller preserved registers.
    1901         663 :     MachineBasicBlock &EntryBB = MF.front();
    1902        3315 :     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1903         663 :       .addImm(0);
    1904             : 
    1905         663 :     Modified = true;
    1906             :   }
    1907             : 
    1908       29664 :   return Modified;
    1909             : }

Generated by: LCOV version 1.13