LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertWaitcnts.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 512 607 84.3 %
Date: 2018-02-23 15:42:53 Functions: 23 25 92.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Insert wait instructions for memory reads and writes.
      12             : ///
      13             : /// Memory reads and writes are issued asynchronously, so we need to insert
      14             : /// S_WAITCNT instructions when we want to access any of their results or
      15             : /// overwrite any register that's used asynchronously.
      16             : //
      17             : //===----------------------------------------------------------------------===//
      18             : 
      19             : #include "AMDGPU.h"
      20             : #include "AMDGPUSubtarget.h"
      21             : #include "SIDefines.h"
      22             : #include "SIInstrInfo.h"
      23             : #include "SIMachineFunctionInfo.h"
      24             : #include "SIRegisterInfo.h"
      25             : #include "Utils/AMDGPUBaseInfo.h"
      26             : #include "llvm/ADT/DenseMap.h"
      27             : #include "llvm/ADT/DenseSet.h"
      28             : #include "llvm/ADT/PostOrderIterator.h"
      29             : #include "llvm/ADT/STLExtras.h"
      30             : #include "llvm/ADT/SmallVector.h"
      31             : #include "llvm/CodeGen/MachineBasicBlock.h"
      32             : #include "llvm/CodeGen/MachineFunction.h"
      33             : #include "llvm/CodeGen/MachineFunctionPass.h"
      34             : #include "llvm/CodeGen/MachineInstr.h"
      35             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      36             : #include "llvm/CodeGen/MachineLoopInfo.h"
      37             : #include "llvm/CodeGen/MachineMemOperand.h"
      38             : #include "llvm/CodeGen/MachineOperand.h"
      39             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      40             : #include "llvm/IR/DebugLoc.h"
      41             : #include "llvm/Pass.h"
      42             : #include "llvm/Support/Debug.h"
      43             : #include "llvm/Support/ErrorHandling.h"
      44             : #include "llvm/Support/raw_ostream.h"
      45             : #include <algorithm>
      46             : #include <cassert>
      47             : #include <cstdint>
      48             : #include <cstring>
      49             : #include <memory>
      50             : #include <utility>
      51             : #include <vector>
      52             : 
      53             : #define DEBUG_TYPE "si-insert-waitcnts"
      54             : 
      55             : using namespace llvm;
      56             : 
      57             : namespace {
      58             : 
      59             : // Class of object that encapsulates latest instruction counter score
      60             : // associated with the operand.  Used for determining whether
      61             : // s_waitcnt instruction needs to be emitted.
      62             : 
      63             : #define CNT_MASK(t) (1u << (t))
      64             : 
      65             : enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
      66             : 
      67             : using RegInterval = std::pair<signed, signed>;
      68             : 
      69             : struct {
      70             :   int32_t VmcntMax;
      71             :   int32_t ExpcntMax;
      72             :   int32_t LgkmcntMax;
      73             :   int32_t NumVGPRsMax;
      74             :   int32_t NumSGPRsMax;
      75             : } HardwareLimits;
      76             : 
      77             : struct {
      78             :   unsigned VGPR0;
      79             :   unsigned VGPRL;
      80             :   unsigned SGPR0;
      81             :   unsigned SGPRL;
      82             : } RegisterEncoding;
      83             : 
      84             : enum WaitEventType {
      85             :   VMEM_ACCESS,      // vector-memory read & write
      86             :   LDS_ACCESS,       // lds read & write
      87             :   GDS_ACCESS,       // gds read & write
      88             :   SQ_MESSAGE,       // send message
      89             :   SMEM_ACCESS,      // scalar-memory read & write
      90             :   EXP_GPR_LOCK,     // export holding on its data src
      91             :   GDS_GPR_LOCK,     // GDS holding on its data and addr src
      92             :   EXP_POS_ACCESS,   // write to export position
      93             :   EXP_PARAM_ACCESS, // write to export parameter
      94             :   VMW_GPR_LOCK,     // vector-memory write holding on its data src
      95             :   NUM_WAIT_EVENTS,
      96             : };
      97             : 
      98             : // The mapping is:
      99             : //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
     100             : //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
     101             : //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
     102             : // We reserve a fixed number of VGPR slots in the scoring tables for
     103             : // special tokens like SCMEM_LDS (needed for buffer load to LDS).
     104             : enum RegisterMapping {
     105             :   SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
     106             :   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
     107             :   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
     108             :   EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
     109             :   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
     110             : };
     111             : 
     112             : #define ForAllWaitEventType(w)                                                 \
     113             :   for (enum WaitEventType w = (enum WaitEventType)0;                           \
     114             :        (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
     115             :        (w) = (enum WaitEventType)((w) + 1))
     116             : 
     117             : // This is a per-basic-block object that maintains current score brackets
     118             : // of each wait-counter, and a per-register scoreboard for each wait-counter.
     119             : // We also maintain the latest score for every event type that can change the
     120             : // waitcnt in order to know if there are multiple types of events within
     121             : // the brackets. When multiple types of event happen in the bracket,
     122             : // wait-count may get decreased out of order, therefore we need to put in
     123             : // "s_waitcnt 0" before use.
     124             : class BlockWaitcntBrackets {
     125             : public:
     126        7097 :   BlockWaitcntBrackets() {
     127       49679 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     128       21291 :          T = (enum InstCounterType)(T + 1)) {
     129       21291 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     130             :     }
     131        7097 :   }
     132             : 
     133             :   ~BlockWaitcntBrackets() = default;
     134             : 
     135             :   static int32_t getWaitCountMax(InstCounterType T) {
     136       40062 :     switch (T) {
     137       14863 :     case VM_CNT:
     138       14863 :       return HardwareLimits.VmcntMax;
     139       20316 :     case LGKM_CNT:
     140       20316 :       return HardwareLimits.LgkmcntMax;
     141        4883 :     case EXP_CNT:
     142       29026 :       return HardwareLimits.ExpcntMax;
     143             :     default:
     144             :       break;
     145             :     }
     146             :     return 0;
     147             :   }
     148             : 
     149             :   void setScoreLB(InstCounterType T, int32_t Val) {
     150             :     assert(T < NUM_INST_CNTS);
     151       42416 :     if (T >= NUM_INST_CNTS)
     152             :       return;
     153      100733 :     ScoreLBs[T] = Val;
     154             :   }
     155             : 
     156             :   void setScoreUB(InstCounterType T, int32_t Val) {
     157             :     assert(T < NUM_INST_CNTS);
     158             :     if (T >= NUM_INST_CNTS)
     159             :       return;
     160      178667 :     ScoreUBs[T] = Val;
     161      121799 :     if (T == EXP_CNT) {
     162       24143 :       int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
     163       24143 :       if (ScoreLBs[T] < UB)
     164        4458 :         ScoreLBs[T] = UB;
     165             :     }
     166             :   }
     167             : 
     168             :   int32_t getScoreLB(InstCounterType T) {
     169             :     assert(T < NUM_INST_CNTS);
     170     2631181 :     if (T >= NUM_INST_CNTS)
     171             :       return 0;
     172     2717724 :     return ScoreLBs[T];
     173             :   }
     174             : 
     175             :   int32_t getScoreUB(InstCounterType T) {
     176             :     assert(T < NUM_INST_CNTS);
     177     2698730 :     if (T >= NUM_INST_CNTS)
     178             :       return 0;
     179     2780918 :     return ScoreUBs[T];
     180             :   }
     181             : 
     182             :   // Mapping from event to counter.
     183             :   InstCounterType eventCounter(WaitEventType E) {
     184      176049 :     switch (E) {
     185             :     case VMEM_ACCESS:
     186             :       return VM_CNT;
     187       68432 :     case LDS_ACCESS:
     188             :     case GDS_ACCESS:
     189             :     case SQ_MESSAGE:
     190             :     case SMEM_ACCESS:
     191             :       return LGKM_CNT;
     192       51268 :     case EXP_GPR_LOCK:
     193             :     case GDS_GPR_LOCK:
     194             :     case VMW_GPR_LOCK:
     195             :     case EXP_POS_ACCESS:
     196             :     case EXP_PARAM_ACCESS:
     197             :       return EXP_CNT;
     198           0 :     default:
     199           0 :       llvm_unreachable("unhandled event type");
     200             :     }
     201             :     return NUM_INST_CNTS;
     202             :   }
     203             : 
     204             :   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
     205      168587 :     if (GprNo < NUM_ALL_VGPRS) {
     206      107805 :       if (GprNo > VgprUB) {
     207       17455 :         VgprUB = GprNo;
     208             :       }
     209      107805 :       VgprScores[T][GprNo] = Val;
     210             :     } else {
     211             :       assert(T == LGKM_CNT);
     212       64632 :       if (GprNo - NUM_ALL_VGPRS > SgprUB) {
     213       15898 :         SgprUB = GprNo - NUM_ALL_VGPRS;
     214             :       }
     215       64632 :       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
     216             :     }
     217             :   }
     218             : 
     219             :   int32_t getRegScore(int GprNo, InstCounterType T) {
     220     2654891 :     if (GprNo < NUM_ALL_VGPRS) {
     221     1465858 :       return VgprScores[T][GprNo];
     222             :     }
     223     1209960 :     return SgprScores[GprNo - NUM_ALL_VGPRS];
     224             :   }
     225             : 
     226       18956 :   void clear() {
     227       18956 :     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     228       18956 :     memset(ScoreUBs, 0, sizeof(ScoreUBs));
     229       18956 :     memset(EventUBs, 0, sizeof(EventUBs));
     230      132692 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     231       56868 :          T = (enum InstCounterType)(T + 1)) {
     232       56868 :       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     233             :     }
     234       18956 :     memset(SgprScores, 0, sizeof(SgprScores));
     235       18956 :   }
     236             : 
     237             :   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
     238             :                              const MachineRegisterInfo *MRI,
     239             :                              const SIRegisterInfo *TRI, unsigned OpNo,
     240             :                              bool Def) const;
     241             : 
     242             :   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
     243             :                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
     244             :                    unsigned OpNo, int32_t Val);
     245             : 
     246             :   void setWaitAtBeginning() { WaitAtBeginning = true; }
     247           0 :   void clearWaitAtBeginning() { WaitAtBeginning = false; }
     248             :   bool getWaitAtBeginning() const { return WaitAtBeginning; }
     249        3221 :   void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
     250             :   int32_t getMaxVGPR() const { return VgprUB; }
     251             :   int32_t getMaxSGPR() const { return SgprUB; }
     252             : 
     253             :   int32_t getEventUB(enum WaitEventType W) const {
     254             :     assert(W < NUM_WAIT_EVENTS);
     255       83801 :     return EventUBs[W];
     256             :   }
     257             : 
     258             :   bool counterOutOfOrder(InstCounterType T);
     259             :   unsigned int updateByWait(InstCounterType T, int ScoreToWait);
     260             :   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
     261             :                      const MachineRegisterInfo *MRI, WaitEventType E,
     262             :                      MachineInstr &MI);
     263             : 
     264             :   bool hasPendingSMEM() const {
     265          43 :     return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
     266             :             EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
     267             :   }
     268             : 
     269             :   bool hasPendingFlat() const {
     270       16291 :     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
     271       30392 :              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
     272       14881 :             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
     273          50 :              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
     274             :   }
     275             : 
     276             :   void setPendingFlat() {
     277        1569 :     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
     278        1569 :     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
     279             :   }
     280             : 
     281       16275 :   int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
     282             : 
     283       56868 :   void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
     284             : 
     285             :   bool getRevisitLoop() const { return RevisitLoop; }
     286         597 :   void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
     287             : 
     288       18956 :   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
     289             :   int32_t getPostOrder() const { return PostOrder; }
     290             : 
     291        2559 :   void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
     292      291733 :   void clearWaitcnt() { Waitcnt = nullptr; }
     293             :   MachineInstr *getWaitcnt() const { return Waitcnt; }
     294             : 
     295             :   bool mixedExpTypes() const { return MixedExpTypes; }
     296             :   void setMixedExpTypes(bool MixedExpTypesIn) {
     297       23491 :     MixedExpTypes = MixedExpTypesIn;
     298             :   }
     299             : 
     300             :   void print(raw_ostream &);
     301             :   void dump() { print(dbgs()); }
     302             : 
     303             : private:
     304             :   bool WaitAtBeginning = false;
     305             :   bool RevisitLoop = false;
     306             :   bool MixedExpTypes = false;
     307             :   int32_t PostOrder = 0;
     308             :   MachineInstr *Waitcnt = nullptr;
     309             :   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
     310             :   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
     311             :   int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
     312             :   // Remember the last flat memory operation.
     313             :   int32_t LastFlat[NUM_INST_CNTS] = {0};
     314             :   // wait_cnt scores for every vgpr.
     315             :   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
     316             :   int32_t VgprUB = 0;
     317             :   int32_t SgprUB = 0;
     318             :   int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
     319             :   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
     320             :   int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
     321             : };
     322             : 
     323             : // This is a per-loop-region object that records waitcnt status at the end of
     324             : // loop footer from the previous iteration. We also maintain an iteration
     325             : // count to track the number of times the loop has been visited. When it
     326             : // doesn't converge naturally, we force convergence by inserting s_waitcnt 0
     327             : // at the end of the loop footer.
     328             : class LoopWaitcntData {
     329             : public:
     330             :   LoopWaitcntData() = default;
     331             :   ~LoopWaitcntData() = default;
     332             : 
     333         193 :   void incIterCnt() { IterCnt++; }
     334         199 :   void resetIterCnt() { IterCnt = 0; }
     335             :   int32_t getIterCnt() { return IterCnt; }
     336             : 
     337           0 :   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
     338             :   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
     339             : 
     340             :   void print() {
     341             :     DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
     342             :   }
     343             : 
     344             : private:
     345             :   // s_waitcnt added at the end of loop footer to stabilize wait scores
     346             :   // at the end of the loop footer.
     347             :   MachineInstr *LfWaitcnt = nullptr;
     348             :   // Number of iterations the loop has been visited, not including the initial
     349             :   // walk over.
     350             :   int32_t IterCnt = 0;
     351             : };
     352             : 
     353        6716 : class SIInsertWaitcnts : public MachineFunctionPass {
     354             : private:
     355             :   const SISubtarget *ST = nullptr;
     356             :   const SIInstrInfo *TII = nullptr;
     357             :   const SIRegisterInfo *TRI = nullptr;
     358             :   const MachineRegisterInfo *MRI = nullptr;
     359             :   const MachineLoopInfo *MLI = nullptr;
     360             :   AMDGPU::IsaInfo::IsaVersion IV;
     361             :   AMDGPUAS AMDGPUASI;
     362             : 
     363             :   DenseSet<MachineBasicBlock *> BlockVisitedSet;
     364             :   DenseSet<MachineInstr *> TrackedWaitcntSet;
     365             :   DenseSet<MachineInstr *> VCCZBugHandledSet;
     366             : 
     367             :   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
     368             :       BlockWaitcntBracketsMap;
     369             : 
     370             :   DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
     371             : 
     372             :   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
     373             : 
     374             :   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
     375             : 
     376             : public:
     377             :   static char ID;
     378             : 
     379        6748 :   SIInsertWaitcnts() : MachineFunctionPass(ID) {}
     380             : 
     381             :   bool runOnMachineFunction(MachineFunction &MF) override;
     382             : 
     383        1680 :   StringRef getPassName() const override {
     384        1680 :     return "SI insert wait instructions";
     385             :   }
     386             : 
     387        1680 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     388        1680 :     AU.setPreservesCFG();
     389             :     AU.addRequired<MachineLoopInfo>();
     390        1680 :     MachineFunctionPass::getAnalysisUsage(AU);
     391        1680 :   }
     392             : 
     393         271 :   void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
     394             :     // The waitcnt information is copied because it changes as the block is
     395             :     // traversed.
     396         271 :     KillWaitBrackets.push_back(
     397         542 :         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
     398         271 :   }
     399             : 
     400             :   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
     401             :   void generateSWaitCntInstBefore(MachineInstr &MI,
     402             :                                   BlockWaitcntBrackets *ScoreBrackets);
     403             :   void updateEventWaitCntAfter(MachineInstr &Inst,
     404             :                                BlockWaitcntBrackets *ScoreBrackets);
     405             :   void mergeInputScoreBrackets(MachineBasicBlock &Block);
     406             :   MachineBasicBlock *loopBottom(const MachineLoop *Loop);
     407             :   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
     408             :   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
     409             :   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
     410             :   unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
     411             : };
     412             : 
     413             : } // end anonymous namespace
     414             : 
     415     3251027 : RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
     416             :                                                  const SIInstrInfo *TII,
     417             :                                                  const MachineRegisterInfo *MRI,
     418             :                                                  const SIRegisterInfo *TRI,
     419             :                                                  unsigned OpNo,
     420             :                                                  bool Def) const {
     421     3251027 :   const MachineOperand &Op = MI->getOperand(OpNo);
     422     5330069 :   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
     423     1163132 :       (Def && !Op.isDef()))
     424     2053382 :     return {-1, -1};
     425             : 
     426             :   // A use via a PW operand does not need a waitcnt.
     427             :   // A partial write is not a WAW.
     428             :   assert(!Op.getSubReg() || !Op.isUndef());
     429             : 
     430             :   RegInterval Result;
     431             :   const MachineRegisterInfo &MRIA = *MRI;
     432             : 
     433     2395290 :   unsigned Reg = TRI->getEncodingValue(Op.getReg());
     434             : 
     435     1197645 :   if (TRI->isVGPR(MRIA, Op.getReg())) {
     436             :     assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
     437      518571 :     Result.first = Reg - RegisterEncoding.VGPR0;
     438             :     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
     439      679074 :   } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
     440             :     assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
     441      679074 :     Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
     442             :     assert(Result.first >= NUM_ALL_VGPRS &&
     443             :            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
     444             :   }
     445             :   // TODO: Handle TTMP
     446             :   // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
     447             :   else
     448           0 :     return {-1, -1};
     449             : 
     450             :   const MachineInstr &MIA = *MI;
     451     1197645 :   const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
     452             :   unsigned Size = TRI->getRegSizeInBits(*RC);
     453     1197645 :   Result.second = Result.first + (Size / 32);
     454             : 
     455     1197645 :   return Result;
     456             : }
     457             : 
     458       25472 : void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
     459             :                                        const SIInstrInfo *TII,
     460             :                                        const SIRegisterInfo *TRI,
     461             :                                        const MachineRegisterInfo *MRI,
     462             :                                        unsigned OpNo, int32_t Val) {
     463       25472 :   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
     464             :   DEBUG({
     465             :     const MachineOperand &Opnd = MI->getOperand(OpNo);
     466             :     assert(TRI->isVGPR(*MRI, Opnd.getReg()));
     467             :   });
     468       68055 :   for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     469             :     setRegScore(RegNo, EXP_CNT, Val);
     470             :   }
     471       25472 : }
     472             : 
     473      121799 : void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     474             :                                          const SIRegisterInfo *TRI,
     475             :                                          const MachineRegisterInfo *MRI,
     476             :                                          WaitEventType E, MachineInstr &Inst) {
     477             :   const MachineRegisterInfo &MRIA = *MRI;
     478             :   InstCounterType T = eventCounter(E);
     479      121799 :   int32_t CurrScore = getScoreUB(T) + 1;
     480             :   // EventUB and ScoreUB need to be updated regardless of whether this event
     481             :   // changes the score of a register or not.
     482             :   // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
     483      121799 :   EventUBs[E] = CurrScore;
     484             :   setScoreUB(T, CurrScore);
     485             : 
     486      121799 :   if (T == EXP_CNT) {
     487             :     // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
     488             :     // is required.
     489       24143 :     if (!MixedExpTypes) {
     490       24143 :       MixedExpTypes = counterOutOfOrder(EXP_CNT);
     491             :     }
     492             : 
     493             :     // Put score on the source vgprs. If this is a store, just use those
     494             :     // specific register(s).
     495       24143 :     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
     496             :       // All GDS operations must protect their address register (same as
     497             :       // export.)
     498           0 :       if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
     499             :           Inst.getOpcode() != AMDGPU::DS_CONSUME) {
     500           0 :         setExpScore(
     501             :             &Inst, TII, TRI, MRI,
     502           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
     503             :             CurrScore);
     504             :       }
     505           0 :       if (Inst.mayStore()) {
     506           0 :         setExpScore(
     507             :             &Inst, TII, TRI, MRI,
     508           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
     509             :             CurrScore);
     510           0 :         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     511             :                                        AMDGPU::OpName::data1) != -1) {
     512           0 :           setExpScore(&Inst, TII, TRI, MRI,
     513             :                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
     514             :                                                  AMDGPU::OpName::data1),
     515             :                       CurrScore);
     516             :         }
     517           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
     518           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
     519           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
     520           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
     521           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
     522           0 :                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
     523           0 :                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
     524           0 :                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
     525             :                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
     526           0 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     527           0 :           const MachineOperand &Op = Inst.getOperand(I);
     528           0 :           if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
     529           0 :             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     530             :           }
     531             :         }
     532             :       }
     533       24143 :     } else if (TII->isFLAT(Inst)) {
     534           0 :       if (Inst.mayStore()) {
     535           0 :         setExpScore(
     536             :             &Inst, TII, TRI, MRI,
     537           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     538             :             CurrScore);
     539           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     540           0 :         setExpScore(
     541             :             &Inst, TII, TRI, MRI,
     542           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     543             :             CurrScore);
     544             :       }
     545       24143 :     } else if (TII->isMIMG(Inst)) {
     546          70 :       if (Inst.mayStore()) {
     547          70 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     548           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     549           0 :         setExpScore(
     550             :             &Inst, TII, TRI, MRI,
     551           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     552             :             CurrScore);
     553             :       }
     554       24073 :     } else if (TII->isMTBUF(Inst)) {
     555          55 :       if (Inst.mayStore()) {
     556          55 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     557             :       }
     558       24018 :     } else if (TII->isMUBUF(Inst)) {
     559       23575 :       if (Inst.mayStore()) {
     560       23575 :         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
     561           0 :       } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
     562           0 :         setExpScore(
     563             :             &Inst, TII, TRI, MRI,
     564           0 :             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
     565             :             CurrScore);
     566             :       }
     567             :     } else {
     568         443 :       if (TII->isEXP(Inst)) {
     569             :         // For export the destination registers are really temps that
     570             :         // can be used as the actual source after export patching, so
     571             :         // we need to treat them like sources and set the EXP_CNT
     572             :         // score.
     573        4430 :         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     574        3987 :           MachineOperand &DefMO = Inst.getOperand(I);
     575        6202 :           if (DefMO.isReg() && DefMO.isDef() &&
     576           0 :               TRI->isVGPR(MRIA, DefMO.getReg())) {
     577           0 :             setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
     578             :                         CurrScore);
     579             :           }
     580             :         }
     581             :       }
     582        4430 :       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     583        3987 :         MachineOperand &MO = Inst.getOperand(I);
     584        6202 :         if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
     585        1772 :           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
     586             :         }
     587             :       }
     588             :     }
     589             : #if 0 // TODO: check if this is handled by MUBUF code above.
     590             :   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
     591             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
     592             :        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
     593             :     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
     594             :     unsigned OpNo;//TODO: find the OpNo for this operand;
     595             :     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
     596             :     for (signed RegNo = Interval.first; RegNo < Interval.second;
     597             :     ++RegNo) {
     598             :       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
     599             :     }
     600             : #endif
     601             :   } else {
     602             :     // Match the score to the destination registers.
     603      734219 :     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
     604      636563 :       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
     605      636563 :       if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
     606           0 :         continue;
     607      865385 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     608             :         setRegScore(RegNo, T, CurrScore);
     609             :       }
     610             :     }
     611       97656 :     if (TII->isDS(Inst) && Inst.mayStore()) {
     612             :       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
     613             :     }
     614             :   }
     615      121799 : }
     616             : 
     617             : void BlockWaitcntBrackets::print(raw_ostream &OS) { // Writes to OS one line per counter type: the bracket size (UB - LB) followed by every register score still above LB.
     618             :   OS << '\n';
     619             :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; // Visit each counter type from VM_CNT up to, but not including, NUM_INST_CNTS.
     620             :        T = (enum InstCounterType)(T + 1)) {
     621             :     int LB = getScoreLB(T);
     622             :     int UB = getScoreUB(T);
     623             : 
     624             :     switch (T) { // Emit the line header: counter name and UB - LB.
     625             :     case VM_CNT:
     626             :       OS << "    VM_CNT(" << UB - LB << "): ";
     627             :       break;
     628             :     case LGKM_CNT:
     629             :       OS << "    LGKM_CNT(" << UB - LB << "): ";
     630             :       break;
     631             :     case EXP_CNT:
     632             :       OS << "    EXP_CNT(" << UB - LB << "): ";
     633             :       break;
     634             :     default:
     635             :       OS << "    UNKNOWN(" << UB - LB << "): ";
     636             :       break;
     637             :     }
     638             : 
     639             :     if (LB < UB) { // Individual register scores are only listed when the bracket is non-empty.
     640             :       // Print vgpr scores, each as "<score relative to LB>:v<index>".
     641             :       for (int J = 0; J <= getMaxVGPR(); J++) {
     642             :         int RegScore = getRegScore(J, T);
     643             :         if (RegScore <= LB) // Scores at or below the lower bound are skipped.
     644             :           continue;
     645             :         int RelScore = RegScore - LB - 1; // Zero-based distance above the lower bound.
     646             :         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
     647             :           OS << RelScore << ":v" << J << " ";
     648             :         } else { // Slots at or past SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS are labelled "ds" instead of a vN register name.
     649             :           OS << RelScore << ":ds ";
     650             :         }
     651             :       }
     652             :       // Also need to print sgpr scores for lgkm_cnt.
     653             :       if (T == LGKM_CNT) {
     654             :         for (int J = 0; J <= getMaxSGPR(); J++) {
     655             :           int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); // SGPR scores are looked up at an offset of NUM_ALL_VGPRS.
     656             :           if (RegScore <= LB)
     657             :             continue;
     658             :           int RelScore = RegScore - LB - 1;
     659             :           OS << RelScore << ":s" << J << " ";
     660             :         }
     661             :       }
     662             :     }
     663             :     OS << '\n';
     664             :   }
     665             :   OS << '\n';
     666             : }
     667             : 
     668     2576931 : unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, // Decides whether counter T must be waited on for ScoreToWait, and advances T's lower bound accordingly.
     669             :                                                 int ScoreToWait) { // Returns CNT_MASK(T) when a wait is needed, 0 otherwise.
     670             :   unsigned int NeedWait = 0;
     671     2576931 :   if (ScoreToWait == -1) {
     672             :     // The score to wait is unknown. This implies that it was not encountered
     673             :     // during the path of the CFG walk done during the current traversal but
     674             :     // may be seen on a different path. Emit an s_wait counter with a
     675             :     // conservative value of 0 for the counter.
     676           0 :     NeedWait = CNT_MASK(T);
     677             :     setScoreLB(T, getScoreUB(T)); // Collapse the bracket: LB is raised to UB.
     678             :     return NeedWait;
     679             :   }
     680             : 
     681             :   // If the score of src_operand falls within the bracket (LB < ScoreToWait
     682             :   // <= UB), we need an s_waitcnt instruction; otherwise 0 is returned.
     683             :   const int32_t LB = getScoreLB(T);
     684             :   const int32_t UB = getScoreUB(T);
     685     2576931 :   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     686       42416 :     if (T == VM_CNT && hasPendingFlat()) {
     687             :       // If there is a pending FLAT operation, and this is a VM waitcnt,
     688             :       // then we need to force a waitcnt 0 for VM.
     689         780 :       NeedWait = CNT_MASK(T);
     690             :       setScoreLB(T, getScoreUB(T));
     691       41636 :     } else if (counterOutOfOrder(T)) {
     692             :       // Counter can get decremented out-of-order when there
     693             :       // are multiple types of events in the bracket. Also emit an s_wait
     694             :       // counter with a conservative value of 0 for the counter.
     695       15857 :       NeedWait = CNT_MASK(T);
     696             :       setScoreLB(T, getScoreUB(T));
     697             :     } else { // Remaining case: raise LB only as far as ScoreToWait, so the bracket above it is kept.
     698       25779 :       NeedWait = CNT_MASK(T);
     699             :       setScoreLB(T, ScoreToWait);
     700             :     }
     701             :   }
     702             : 
     703             :   return NeedWait;
     704             : }
     705             : 
     706             : // Where there are multiple types of event in the bracket of a counter,
     707             : // the decrement may go out of order. Returns true when counter T may be decremented out of order, false when it is treated as in-order.
     708       65779 : bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
     709       65779 :   switch (T) {
     710             :   case VM_CNT: // VM_CNT is always reported as in-order by this function.
     711             :     return false;
     712       20357 :   case LGKM_CNT: {
     713       36214 :     if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && // An event type counts as pending when its upper bound lies in (LB, UB].
     714       15857 :         EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     715             :       // Scalar memory read always can go out of order.
     716             :       return true;
     717             :     }
     718             :     int NumEventTypes = 0; // Number of distinct non-SMEM event types pending on LGKM_CNT.
     719        9000 :     if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     720        4500 :         EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     721             :       NumEventTypes++;
     722             :     }
     723        4500 :     if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
     724           0 :         EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
     725           0 :       NumEventTypes++;
     726             :     }
     727        4500 :     if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
     728           0 :         EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
     729           0 :       NumEventTypes++;
     730             :     }
     731        4500 :     if (NumEventTypes <= 1) { // Zero or one pending event type: reported as in-order.
     732             :       return false;
     733             :     }
     734             :     break; // Two or more pending event types: leave the switch and return true.
     735             :   }
     736       30641 :   case EXP_CNT: {
     737             :     // If there has been a mixture of export types, then a waitcnt exp(0) is
     738             :     // required.
     739       30641 :     if (MixedExpTypes)
     740             :       return true;
     741             :     int NumEventTypes = 0; // Number of distinct event types pending on EXP_CNT.
     742       30935 :     if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     743         294 :         EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     744             :       NumEventTypes++;
     745             :     }
     746       30641 :     if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     747           0 :         EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     748           0 :       NumEventTypes++;
     749             :     }
     750       60829 :     if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
     751       30188 :         EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
     752       30188 :       NumEventTypes++;
     753             :     }
     754       30771 :     if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
     755         130 :         EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
     756         130 :       NumEventTypes++;
     757             :     }
     758             : 
     759       30678 :     if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
     760          37 :         EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
     761          37 :       NumEventTypes++;
     762             :     }
     763             : 
     764       30641 :     if (NumEventTypes <= 1) { // Zero or one pending event type: reported as in-order.
     765             :       return false;
     766             :     }
     767             :     break; // Two or more pending event types: leave the switch and return true.
     768             :   }
     769             :   default: // Any other counter type falls through to the out-of-order answer below.
     770             :     break;
     771             :   }
     772           8 :   return true;
     773             : }
     774             : 
     775       59438 : INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     776             :                       false) // Opens the registration of SIInsertWaitcnts under DEBUG_TYPE; nothing is declared between BEGIN and END.
     777      279728 : INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
     778             :                     false) // NOTE(review): the two false flags are presumably LLVM's CFG-only / is-analysis flags -- confirm against PassSupport.h.
     779             : 
     780             : char SIInsertWaitcnts::ID = 0; // Definition of the pass's static ID member.
     781             : 
     782             : char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; // Reference in namespace llvm bound to the same ID object.
     783             : 
     784        1686 : FunctionPass *llvm::createSIInsertWaitcntsPass() { // Factory: returns a newly heap-allocated SIInsertWaitcnts as a FunctionPass*.
     785        1686 :   return new SIInsertWaitcnts(); // The caller receives the raw pointer; presumably the pass manager takes ownership -- confirm at the call sites.
     786             : }
     787             : 
     788             : static bool readsVCCZ(const MachineInstr &MI) { // True iff MI is S_CBRANCH_VCCNZ or S_CBRANCH_VCCZ and its operand 1 is not marked undef.
     789      583449 :   unsigned Opc = MI.getOpcode();
     790      583885 :   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
     791         436 :          !MI.getOperand(1).isUndef(); // NOTE(review): operand 1 is presumably the VCC use of the branch -- confirm against the instruction definition.
     792             : }
     793             : 
     794             : /// \brief Given wait count encodings, checks if LHS is at least as strong as RHS: returns false as soon as a decoded LHS counter (vmcnt, lgkmcnt, expcnt) exceeds the RHS one, true otherwise (including when both are equal).
     795        1124 : bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
     796        1124 :   if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
     797             :     return false;
     798        1124 :   if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
     799             :     return false;
     800        1115 :   if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
     801             :     return false;
     802        1115 :   return true; // No LHS counter is larger than its RHS counterpart.
     803             : }
     804             : 
     805             : /// \brief Given wait count encodings create a new encoding which is stronger
     806             : /// or equal to both: each counter field is the minimum of the two decoded inputs.
     807           0 : unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
     808           0 :   unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
     809           0 :                             AMDGPU::decodeVmcnt(IV, RHS));
     810           0 :   unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
     811           0 :                               AMDGPU::decodeLgkmcnt(IV, RHS));
     812           0 :   unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
     813           0 :                              AMDGPU::decodeExpcnt(IV, RHS));
     814           0 :   return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); // Passed as VmCnt, ExpCnt, LgkmCnt -- not the order in which they were computed above.
     815             : }
     816             : 
     817             : ///  \brief Generate s_waitcnt instruction to be placed before MI.
     818             : ///  Instructions of a given type are returned in order,
     819             : ///  but instructions of different types can complete out of order.
     820             : ///  We rely on this in-order completion
     821             : ///  and simply assign a score to the memory access instructions.
     822             : ///  We keep track of the active "score bracket" to determine
     823             : ///  if an access of a memory read requires an s_waitcnt
     824             : ///  and if so what the value of each counter is.
     825             : ///  The "score bracket" is bound by the lower bound and upper bound
     826             : ///  scores (getScoreLB() and getScoreUB() respectively).
     827      291733 : void SIInsertWaitcnts::generateSWaitCntInstBefore(
     828             :     MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
     829             :   // To emit, or not to emit - that's the question!
     830             :   // Start with an assumption that there is no need to emit.
     831             :   unsigned int EmitSwaitcnt = 0;
     832             :   // No need to wait before phi. If a phi-move exists, then the wait should
     833             :   // have been inserted before the move. If a phi-move does not exist, then
     834             :   // the wait should be inserted before the real use. The same is true for
     835             :   // sc-merge. It is not a coincidence that all these cases correspond to the
     836             :   // instructions that are skipped in the assembling loop.
     837             :   bool NeedLineMapping = false; // TODO: Check on this.
     838      291733 :   if (MI.isDebugValue() &&
     839             :       // TODO: any other opcode?
     840             :       !NeedLineMapping) {
     841             :     return;
     842             :   }
     843             : 
     844             :   // See if an s_waitcnt is forced at block entry, or is needed at
     845             :   // program end.
     846      291716 :   if (ScoreBrackets->getWaitAtBeginning()) {
     847             :     // Note that we have already cleared the state, so we don't need to update
     848             :     // it.
     849             :     ScoreBrackets->clearWaitAtBeginning();
     850           0 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     851           0 :          T = (enum InstCounterType)(T + 1)) {
     852           0 :       EmitSwaitcnt |= CNT_MASK(T);
     853             :       ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     854             :     }
     855             :   }
     856             : 
     857             :   // See if this instruction has a forced S_WAITCNT VM.
     858             :   // TODO: Handle other cases of NeedsWaitcntVmBefore()
     859      291600 :   else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
     860      583315 :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
     861             :            MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
     862             :     EmitSwaitcnt |=
     863        1135 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     864             :   }
     865             : 
     866             :   // All waits must be resolved at call return.
     867             :   // NOTE: this could be improved with knowledge of all call sites or
     868             :   //   with knowledge of the called routines.
     869      583432 :   if (MI.getOpcode() == AMDGPU::RETURN ||
     870      583019 :       MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
     871             :       MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
     872       10808 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
     873        4632 :          T = (enum InstCounterType)(T + 1)) {
     874        4632 :       if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
     875             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
     876        1412 :         EmitSwaitcnt |= CNT_MASK(T);
     877             :       }
     878             :     }
     879             :   }
     880             :   // Resolve vm waits before gs-done.
     881      290160 :   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
     882      290196 :             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
     883          24 :            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
     884             :             AMDGPU::SendMsg::ID_GS_DONE)) {
     885           6 :     if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
     886             :       ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
     887           0 :       EmitSwaitcnt |= CNT_MASK(VM_CNT);
     888             :     }
     889             :   }
     890             : #if 0 // TODO: the following blocks of logic when we have fence.
     891             :   else if (MI.getOpcode() == SC_FENCE) {
     892             :     const unsigned int group_size =
     893             :       context->shader_info->GetMaxThreadGroupSize();
     894             :     // group_size == 0 means thread group size is unknown at compile time
     895             :     const bool group_is_multi_wave =
     896             :       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
     897             :     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
     898             : 
     899             :     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
     900             :       SCRegType src_type = Inst->GetSrcType(i);
     901             :       switch (src_type) {
     902             :         case SCMEM_LDS:
     903             :           if (group_is_multi_wave ||
     904             :             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
     905             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     906             :                                ScoreBrackets->getScoreUB(LGKM_CNT));
     907             :             // LDS may have to wait for VM_CNT after buffer load to LDS
     908             :             if (target_info->HasBufferLoadToLDS()) {
     909             :               EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     910             :                                  ScoreBrackets->getScoreUB(VM_CNT));
     911             :             }
     912             :           }
     913             :           break;
     914             : 
     915             :         case SCMEM_GDS:
     916             :           if (group_is_multi_wave || fence_is_global) {
     917             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     918             :               ScoreBrackets->getScoreUB(EXP_CNT));
     919             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
     920             :               ScoreBrackets->getScoreUB(LGKM_CNT));
     921             :           }
     922             :           break;
     923             : 
     924             :         case SCMEM_UAV:
     925             :         case SCMEM_TFBUF:
     926             :         case SCMEM_RING:
     927             :         case SCMEM_SCATTER:
     928             :           if (group_is_multi_wave || fence_is_global) {
     929             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
     930             :               ScoreBrackets->getScoreUB(EXP_CNT));
     931             :             EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
     932             :               ScoreBrackets->getScoreUB(VM_CNT));
     933             :           }
     934             :           break;
     935             : 
     936             :         case SCMEM_SCRATCH:
     937             :         default:
     938             :           break;
     939             :       }
     940             :     }
     941             :   }
     942             : #endif
     943             : 
     944             :   // Export & GDS instructions do not read the EXEC mask until after the export
     945             :   // is granted (which can occur well after the instruction is issued).
     946             :   // The shader program must flush all EXP operations on the export-count
     947             :   // before overwriting the EXEC mask.
     948             :   else {
     949      580332 :     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
     950             :       // Export and GDS are tracked individually, either may trigger a waitcnt
     951             :       // for EXEC.
     952        2119 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     953             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
     954        2119 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     955             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
     956        2119 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     957             :           EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
     958        2119 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     959             :           EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
     960             :     }
     961             : 
     962             : #if 0 // TODO: the following code to handle CALL.
     963             :     // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
     964             :     // However, there is a problem with EXP_CNT, because the call cannot
     965             :     // easily tell if a register is used in the function, and if it did, then
     966             :     // the referring instruction would have to have an S_WAITCNT, which is
     967             :     // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
     968             :     // before the call.
     969             :     if (MI.getOpcode() == SC_CALL) {
     970             :       if (ScoreBrackets->getScoreUB(EXP_CNT) >
     971             :         ScoreBrackets->getScoreLB(EXP_CNT)) {
     972             :         ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
     973             :         EmitSwaitcnt |= CNT_MASK(EXP_CNT);
     974             :       }
     975             :     }
     976             : #endif
     977             : 
     978             :     // FIXME: Should not be relying on memoperands.
     979             :     // Look at the source operands of every instruction to see if
     980             :     // any of them results from a previous memory operation that affects
     981             :     // its current usage. If so, an s_waitcnt instruction needs to be
     982             :     // emitted.
     983             :     // If the source operand was defined by a load, add the s_waitcnt
     984             :     // instruction.
     985      473682 :     for (const MachineMemOperand *Memop : MI.memoperands()) {
     986             :       unsigned AS = Memop->getAddrSpace();
     987       91758 :       if (AS != AMDGPUASI.LOCAL_ADDRESS)
     988       82383 :         continue;
     989             :       unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
     990             :       // VM_CNT is only relevant to vgpr or LDS.
     991        9375 :       EmitSwaitcnt |= ScoreBrackets->updateByWait(
     992             :           VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
     993             :     }
     994             : 
     995     1584662 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
     996     1294496 :       const MachineOperand &Op = MI.getOperand(I);
     997     1294496 :       const MachineRegisterInfo &MRIA = *MRI;
     998             :       RegInterval Interval =
     999     1294496 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
    1000     2709029 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1001     1414533 :         if (TRI->isVGPR(MRIA, Op.getReg())) {
    1002             :           // VM_CNT is only relevant to vgpr or LDS.
    1003      402016 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1004             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1005             :         }
    1006     1414533 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1007             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1008             :       }
    1009             :     }
    1010             :     // End of for loop that looks at all source operands to decide vm_wait_cnt
    1011             :     // and lgk_wait_cnt.
    1012             : 
    1013             :     // Two cases are handled for destination operands:
    1014             :     // 1) If the destination operand was defined by a load, add the s_waitcnt
    1015             :     // instruction to guarantee the right WAW order.
    1016             :     // 2) If a destination operand that was used by a recent export/store ins,
    1017             :     // add s_waitcnt on exp_cnt to guarantee the WAR order.
    1018      290166 :     if (MI.mayStore()) {
    1019             :       // FIXME: Should not be relying on memoperands.
    1020      108653 :       for (const MachineMemOperand *Memop : MI.memoperands()) {
    1021             :         unsigned AS = Memop->getAddrSpace();
    1022       36264 :         if (AS != AMDGPUASI.LOCAL_ADDRESS)
    1023       30488 :           continue;
    1024             :         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    1025        5776 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1026             :             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1027        5776 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1028             :             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1029             :       }
    1030             :     }
    1031     1584662 :     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    1032     1294496 :       MachineOperand &Def = MI.getOperand(I);
    1033     1294496 :       const MachineRegisterInfo &MRIA = *MRI;
    1034             :       RegInterval Interval =
    1035     1294496 :           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
    1036     1657137 :       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    1037      362641 :         if (TRI->isVGPR(MRIA, Def.getReg())) {
    1038      183504 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1039             :               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    1040      183504 :           EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1041             :               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
    1042             :         }
    1043      362641 :         EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1044             :             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
    1045             :       }
    1046             :     } // End of for loop that looks at all dest operands.
    1047             :   }
    1048             : 
    1049             :   // TODO: Tie force zero to a compiler triage option.
    1050             :   bool ForceZero = false;
    1051             : 
    1052             :   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
    1053             :   // occurs before the instruction. Doing it here prevents any additional
    1054             :   // S_WAITCNTs from being emitted if the instruction was marked as
    1055             :   // requiring a WAITCNT beforehand.
    1056      583499 :   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
    1057          67 :       !ST->hasAutoWaitcntBeforeBarrier()) {
    1058          65 :     EmitSwaitcnt |=
    1059          65 :         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1060          65 :     EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1061             :         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1062          65 :     EmitSwaitcnt |= ScoreBrackets->updateByWait(
    1063             :         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
    1064             :   }
    1065             : 
    1066             :   // TODO: Remove this work-around, enable the assert for Bug 457939
    1067             :   //       after fixing the scheduler. Also, the Shader Compiler code is
    1068             :   //       independent of target.
    1069         218 :   if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    1070             :     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1071         148 :             ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1072          21 :         ScoreBrackets->hasPendingSMEM()) {
    1073             :       // Wait on everything, not just LGKM.  vccz reads usually come from
    1074             :       // terminators, and we always wait on everything at the end of the
    1075             :       // block, so if we only wait on LGKM here, we might end up with
    1076             :       // another s_waitcnt inserted right after this if there are non-LGKM
    1077             :       // instructions still outstanding.
    1078             :       ForceZero = true;
    1079             :       EmitSwaitcnt = true;
    1080             :     }
    1081             :   }
    1082             : 
    1083             :   // Does this operand processing indicate s_wait counter update?
    1084      291709 :   if (EmitSwaitcnt) {
    1085             :     int CntVal[NUM_INST_CNTS];
    1086             : 
    1087             :     bool UseDefaultWaitcntStrategy = true;
    1088       37180 :     if (ForceZero) {
    1089             :       // Force all waitcnts to 0.
    1090          49 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1091          21 :            T = (enum InstCounterType)(T + 1)) {
    1092             :         ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1093             :       }
    1094           7 :       CntVal[VM_CNT] = 0;
    1095           7 :       CntVal[EXP_CNT] = 0;
    1096           7 :       CntVal[LGKM_CNT] = 0;
    1097             :       UseDefaultWaitcntStrategy = false;
    1098             :     }
    1099             : 
    1100             :     if (UseDefaultWaitcntStrategy) {
    1101      260211 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1102      111519 :            T = (enum InstCounterType)(T + 1)) {
    1103      111519 :         if (EmitSwaitcnt & CNT_MASK(T)) {
    1104             :           int Delta =
    1105       40062 :               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
    1106             :           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
    1107       40062 :           if (Delta >= MaxDelta) {
    1108             :             Delta = -1;
    1109          16 :             if (T != EXP_CNT) {
    1110          16 :               ScoreBrackets->setScoreLB(
    1111             :                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
    1112             :             }
    1113          16 :             EmitSwaitcnt &= ~CNT_MASK(T);
    1114             :           }
    1115       40062 :           CntVal[T] = Delta;
    1116             :         } else {
    1117             :           // If we are not waiting for a particular counter then encode
    1118             :           // it as -1 which means "don't care."
    1119       71457 :           CntVal[T] = -1;
    1120             :         }
    1121             :       }
    1122             :     }
    1123             : 
    1124             :     // If we are not waiting on any counter we can skip the wait altogether.
    1125       37180 :     if (EmitSwaitcnt != 0) {
    1126       37167 :       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
    1127       37167 :       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
    1128        1365 :       if (!OldWaitcnt ||
    1129        1365 :           (AMDGPU::decodeVmcnt(IV, Imm) !=
    1130        2719 :                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
    1131        1354 :           (AMDGPU::decodeExpcnt(IV, Imm) !=
    1132       39873 :            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
    1133        1352 :           (AMDGPU::decodeLgkmcnt(IV, Imm) !=
    1134        1352 :            (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
    1135       35815 :         MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
    1136         231 :         if (ContainingLoop) {
    1137         231 :           MachineBasicBlock *TBB = ContainingLoop->getHeader();
    1138             :           BlockWaitcntBrackets *ScoreBracket =
    1139         231 :               BlockWaitcntBracketsMap[TBB].get();
    1140         231 :           if (!ScoreBracket) {
    1141             :             assert(!BlockVisitedSet.count(TBB));
    1142             :             BlockWaitcntBracketsMap[TBB] =
    1143             :                 llvm::make_unique<BlockWaitcntBrackets>();
    1144             :             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    1145             :           }
    1146             :           ScoreBracket->setRevisitLoop(true);
    1147             :           DEBUG(dbgs() << "set-revisit: Block"
    1148             :                        << ContainingLoop->getHeader()->getNumber() << '\n';);
    1149             :         }
    1150             :       }
    1151             : 
    1152             :       // Update an existing waitcount, or make a new one.
    1153       74334 :       unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
    1154      111501 :                                            CntVal[EXP_CNT], CntVal[LGKM_CNT]);
    1155             :       // We don't remove waitcnts that existed prior to the waitcnt
    1156             :       // pass. Check if the waitcnt to-be-inserted can be avoided
    1157             :       // or if the prev waitcnt can be updated.
    1158             :       bool insertSWaitInst = true;
    1159       37167 :       for (MachineBasicBlock::iterator I = MI.getIterator(),
    1160       37167 :                                        B = MI.getParent()->begin();
    1161       74044 :            insertSWaitInst && I != B; --I) {
    1162       72967 :         if (I == MI.getIterator())
    1163             :           continue;
    1164             : 
    1165       72180 :         switch (I->getOpcode()) {
    1166             :         case AMDGPU::S_WAITCNT:
    1167        1124 :           if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
    1168             :             insertSWaitInst = false;
    1169           9 :           else if (!OldWaitcnt) {
    1170             :             OldWaitcnt = &*I;
    1171           0 :             Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
    1172             :           }
    1173             :           break;
    1174             :         // TODO: skip over instructions which never require wait.
    1175             :         }
    1176             :         break;
    1177             :       }
    1178       37167 :       if (insertSWaitInst) {
    1179       36302 :         if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
    1180         250 :           OldWaitcnt->getOperand(0).setImm(Enc);
    1181         250 :           if (!OldWaitcnt->getParent())
    1182         241 :             MI.getParent()->insert(MI, OldWaitcnt);
    1183             : 
    1184             :           DEBUG(dbgs() << "updateWaitcntInBlock\n"
    1185             :                        << "Old Instr: " << MI << '\n'
    1186             :                        << "New Instr: " << *OldWaitcnt << '\n');
    1187             :         } else {
    1188       35802 :             auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
    1189       35802 :                                MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1190       35802 :                              .addImm(Enc);
    1191       71604 :             TrackedWaitcntSet.insert(SWaitInst);
    1192             : 
    1193             :             DEBUG(dbgs() << "insertWaitcntInBlock\n"
    1194             :                          << "Old Instr: " << MI << '\n'
    1195             :                          << "New Instr: " << *SWaitInst << '\n');
    1196             :         }
    1197             :       }
    1198             : 
    1199       37167 :       if (CntVal[EXP_CNT] == 0) {
    1200             :         ScoreBrackets->setMixedExpTypes(false);
    1201             :       }
    1202             :     }
    1203             :   }
    1204             : }
    1205             : 
    1206           0 : void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
    1207             :                                              MachineInstr *Waitcnt) {
                         // Place Waitcnt at the end of MBB, but ahead of the final instruction
                         // when that instruction is a branch. Only the last instruction of the
                         // block is inspected; earlier instructions are not examined.
    1208           0 :   if (MBB.empty()) {
                           // Empty block: Waitcnt simply becomes its only instruction.
    1209             :     MBB.push_back(Waitcnt);
    1210           0 :     return;
    1211             :   }
    1212             : 
                         // It is stepped back from end() so that it designates the last
                         // instruction of the (non-empty) block.
    1213           0 :   MachineBasicBlock::iterator It = MBB.end();
    1214             :   MachineInstr *MI = &*(--It);
    1215           0 :   if (MI->isBranch()) {
                           // Insert in front of the trailing branch.
    1216             :     MBB.insert(It, Waitcnt);
    1217             :   } else {
                           // No trailing branch: append after the last instruction.
    1218             :     MBB.push_back(Waitcnt);
    1219             :   }
    1220             : }
    1221             : 
    1222             : // Decide whether the flat memory operation MI has to be treated as possibly
    1223             : // accessing LDS. Returns true when MI carries no memory operands, or when
                       // any of its memory operands is in the LOCAL or FLAT address space, and
                       // false otherwise. This function only answers the query; the caller is
                       // responsible for recording the result.
    1224        8543 : bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
                         // Without memory operands there is no address-space information to
                         // inspect, so the answer is "may access LDS".
    1225        8543 :   if (MI.memoperands_empty())
    1226             :     return true;
    1227             : 
    1228       22479 :   for (const MachineMemOperand *Memop : MI.memoperands()) {
    1229             :     unsigned AS = Memop->getAddrSpace();
    1230        8530 :     if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
    1231             :       return true;
    1232             :   }
    1233             : 
                         // Every memory operand is in some other address space.
    1234             :   return false;
    1235             : }
    1236             : 
    1237      291733 : void SIInsertWaitcnts::updateEventWaitCntAfter(
    1238             :     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
    1239             :   // Now look at the instruction opcode. If it is a memory access
    1240             :   // instruction, update the upper-bound of the appropriate counter's
    1241             :   // bracket and the destination operand scores.
    1242             :   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
                         //
                         // The instruction is classified by the first matching case below (DS,
                         // FLAT, VMEM, SMRD, then a few individual opcodes) and the matching
                         // event(s) are reported through ScoreBrackets->updateByEvent().
                         // Instructions that match none of the cases record no event.
    1243      298393 :   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
                           // DS instruction: with the gds modifier set it records both
                           // GDS_ACCESS and GDS_GPR_LOCK, otherwise LDS_ACCESS.
    1244        6660 :     if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
    1245           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
    1246           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    1247             :     } else {
    1248        6660 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1249             :     }
    1250      285073 :   } else if (TII->isFLAT(Inst)) {
    1251             :     assert(Inst.mayLoad() || Inst.mayStore());
    1252             : 
                           // A FLAT instruction may record a VMEM_ACCESS event, an LDS_ACCESS
                           // event, or both, depending on which counters it uses.
    1253       10669 :     if (TII->usesVM_CNT(Inst))
    1254       10669 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    1255             : 
    1256       10669 :     if (TII->usesLGKM_CNT(Inst)) {
    1257        8543 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    1258             : 
    1259             :       // This is a flat memory operation, so note it - it will require
    1260             :       // that both the VM and LGKM be flushed to zero if it is pending when
    1261             :       // a VM or LGKM dependency occurs.
    1262        8543 :       if (mayAccessLDSThroughFlat(Inst))
    1263             :         ScoreBrackets->setPendingFlat();
    1264             :     }
    1265       41390 :   } else if (SIInstrInfo::isVMEM(Inst) &&
    1266             :              // TODO: get a better carve out.
                                // The three BUFFER_WBINVL1* opcodes are excluded from VMEM
                                // event tracking.
    1267       41274 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
    1268       41273 :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
    1269             :              Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    1270       40255 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
                           // Stores, and opcodes for which getAtomicNoRetOp() returns something
                           // other than -1, additionally record a VMW_GPR_LOCK event.
    1271             :     if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
    1272       56810 :         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
    1273       23700 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    1274             :     }
    1275      234149 :   } else if (TII->isSMRD(Inst)) {
    1276       31505 :     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1277             :   } else {
                           // Remaining cases are selected by individual opcode.
    1278      202644 :     switch (Inst.getOpcode()) {
    1279          24 :     case AMDGPU::S_SENDMSG:
    1280             :     case AMDGPU::S_SENDMSGHALT:
    1281          24 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
    1282          24 :       break;
    1283         443 :     case AMDGPU::EXP:
    1284             :     case AMDGPU::EXP_DONE: {
                             // The export event is chosen from the value of the tgt operand:
                             // 32..63 -> EXP_PARAM_ACCESS, 12..15 -> EXP_POS_ACCESS, and any
                             // other value -> EXP_GPR_LOCK.
    1285         443 :       int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    1286         443 :       if (Imm >= 32 && Imm <= 63)
    1287         120 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    1288         323 :       else if (Imm >= 12 && Imm <= 15)
    1289          33 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    1290             :       else
    1291         290 :         ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
    1292             :       break;
    1293             :     }
    1294           0 :     case AMDGPU::S_MEMTIME:
    1295             :     case AMDGPU::S_MEMREALTIME:
                             // These two opcodes are recorded with the same event as SMRD
                             // instructions.
    1296           0 :       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
    1297           0 :       break;
    1298             :     default:
    1299             :       break;
    1300             :     }
    1301             :   }
    1302      291733 : }
    1303             : 
    1304       18956 : void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
                         // Rebuild the entry state of Block's score brackets from its inputs.
                         // The inputs are the brackets of already-visited predecessors and, when
                         // Block has no successors, every entry saved in KillWaitBrackets.
                         //
                         // Steps, in order:
                         //   1. Clear Block's brackets.
                         //   2. For each counter, compute the largest pending span (UB - LB) and
                         //      the largest flat span (pendingFlat - LB) over the inputs, then
                         //      widen the EXP_CNT span using the GDS_GPR_LOCK / EXP_GPR_LOCK
                         //      event upper bounds.
                         //   3. Set LB to 0, UB to the largest pending span and the last-flat
                         //      score to the largest flat span.
                         //   4. Merge register scores and event upper bounds from each input,
                         //      rebased onto the new upper bound, keeping the maximum.
    1305       37912 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1306       18956 :   int32_t MaxPending[NUM_INST_CNTS] = {0};
    1307       18956 :   int32_t MaxFlat[NUM_INST_CNTS] = {0};
    1308             :   bool MixedExpTypes = false;
    1309             : 
    1310             :   // Clear the score bracket state.
    1311       18956 :   ScoreBrackets->clear();
    1312             : 
    1313             :   // Compute the number of pending elements on block entry.
    1314             : 
    1315             :   // IMPORTANT NOTE: If iterative handling of loops is added, the code will
    1316             :   // need to handle single BBs with backedges to themselves. This means that
    1317             :   // they will need to retain and not clear their initial state.
    1318             : 
    1319             :   // See if there are any uninitialized predecessors. If so, emit an
    1320             :   // s_waitcnt 0 at the beginning of the block.
                         // NOTE(review): the loop below does not emit anything itself. It skips
                         // predecessors that are unvisited or have getWaitAtBeginning() set, and
                         // accumulates the maximum pending and flat spans from the rest.
    1321       22536 :   for (MachineBasicBlock *pred : Block.predecessors()) {
    1322             :     BlockWaitcntBrackets *PredScoreBrackets =
    1323             :         BlockWaitcntBracketsMap[pred].get();
    1324        3580 :     bool Visited = BlockVisitedSet.count(pred);
    1325        3502 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1326          78 :       continue;
    1327             :     }
    1328       24514 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1329       10506 :          T = (enum InstCounterType)(T + 1)) {
    1330             :       int span =
    1331       10506 :           PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
    1332       21012 :       MaxPending[T] = std::max(MaxPending[T], span);
    1333       10506 :       span =
    1334       10506 :           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
    1335       21012 :       MaxFlat[T] = std::max(MaxFlat[T], span);
    1336             :     }
    1337             : 
    1338        3502 :     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
    1339             :   }
    1340             : 
    1341             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1342             :   // Also handle kills for exit block.
                         // The saved kill brackets contribute to the spans exactly like
                         // predecessors do, but only for blocks without successors.
    1343       35569 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1344        6067 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1345       13461 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1346        5769 :            T = (enum InstCounterType)(T + 1)) {
    1347        5769 :         int Span = KillWaitBrackets[I]->getScoreUB(T) -
    1348        5769 :                    KillWaitBrackets[I]->getScoreLB(T);
    1349       11538 :         MaxPending[T] = std::max(MaxPending[T], Span);
    1350        5769 :         Span = KillWaitBrackets[I]->pendingFlat(T) -
    1351             :                KillWaitBrackets[I]->getScoreLB(T);
    1352       11538 :         MaxFlat[T] = std::max(MaxFlat[T], Span);
    1353             :       }
    1354             : 
    1355        1923 :       MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    1356             :     }
    1357             :   }
    1358             : 
    1359             :   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
                         // The EXP_CNT span is additionally widened to cover the distance from
                         // each input's EXP_CNT lower bound to these two events' upper bounds.
    1360       22536 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1361             :     BlockWaitcntBrackets *PredScoreBrackets =
    1362             :         BlockWaitcntBracketsMap[Pred].get();
    1363        3580 :     bool Visited = BlockVisitedSet.count(Pred);
    1364        3580 :     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
    1365          78 :       continue;
    1366             :     }
    1367             : 
    1368        3502 :     int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
    1369        3502 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1370        3502 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1371        3502 :     int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
    1372        3502 :                   PredScoreBrackets->getScoreLB(EXP_CNT);
    1373        3502 :     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1374             :   }
    1375             : 
    1376             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1377       35569 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1378        6067 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1379        1923 :       int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
    1380        1923 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1381        1923 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    1382        1923 :       int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
    1383        1923 :                     KillWaitBrackets[I]->getScoreLB(EXP_CNT);
    1384        1923 :       MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    1385             :     }
    1386             :   }
    1387             : 
    1388             : #if 0
    1389             :   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
    1390             :   // TODO: how does LC distinguish between function entry and main entry?
    1391             :   // If this is the entry to a function, force a wait.
                         // (This region is disabled by the surrounding #if 0.)
    1392             :   MachineBasicBlock &Entry = Block.getParent()->front();
    1393             :   if (Entry.getNumber() == Block.getNumber()) {
    1394             :     ScoreBrackets->setWaitAtBeginning();
    1395             :     return;
    1396             :   }
    1397             : #endif
    1398             : 
    1399             :   // Now set the current Block's brackets to the largest ending bracket.
                         // Every counter restarts with a lower bound of 0 and an upper bound
                         // equal to the largest pending span found above.
    1400      132692 :   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1401       56868 :        T = (enum InstCounterType)(T + 1)) {
    1402       56868 :     ScoreBrackets->setScoreUB(T, MaxPending[T]);
    1403             :     ScoreBrackets->setScoreLB(T, 0);
    1404       56868 :     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
    1405             :   }
    1406             : 
    1407             :   ScoreBrackets->setMixedExpTypes(MixedExpTypes);
    1408             : 
    1409             :   // Set the register scoreboard.
                         // NOTE(review): unlike the span loops above, this loop only skips
                         // unvisited predecessors; it does not test getWaitAtBeginning().
                         // Confirm that the difference is intended.
    1410       22536 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1411          78 :     if (!BlockVisitedSet.count(Pred)) {
    1412          78 :       continue;
    1413             :     }
    1414             : 
    1415             :     BlockWaitcntBrackets *PredScoreBrackets =
    1416             :         BlockWaitcntBracketsMap[Pred].get();
    1417             : 
    1418             :     // Now merge the gpr_reg_score information
    1419       24514 :     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1420       10506 :          T = (enum InstCounterType)(T + 1)) {
    1421             :       int PredLB = PredScoreBrackets->getScoreLB(T);
    1422             :       int PredUB = PredScoreBrackets->getScoreUB(T);
                             // Only counters with something pending in the predecessor
                             // (LB < UB) contribute register scores.
    1423       10506 :       if (PredLB < PredUB) {
                               // PredScale shifts a predecessor score so that the predecessor's
                               // upper bound maps onto this block's upper bound, MaxPending[T].
    1424        2075 :         int PredScale = MaxPending[T] - PredUB;
    1425             :         // Merge vgpr scores.
    1426      160451 :         for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
    1427             :           int PredRegScore = PredScoreBrackets->getRegScore(J, T);
                                 // Scores at or below the lower bound are skipped.
    1428       79188 :           if (PredRegScore <= PredLB)
    1429       71273 :             continue;
    1430        7915 :           int NewRegScore = PredScale + PredRegScore;
    1431        7915 :           ScoreBrackets->setRegScore(
    1432       15830 :               J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1433             :         }
    1434             :         // Also need to merge sgpr scores for lgkm_cnt.
                               // SGPR scores are stored at index NUM_ALL_VGPRS + J.
    1435        2075 :         if (T == LGKM_CNT) {
    1436        9940 :           for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
    1437             :             int PredRegScore =
    1438        4564 :                 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1439        4564 :             if (PredRegScore <= PredLB)
    1440        3130 :               continue;
    1441        1434 :             int NewRegScore = PredScale + PredRegScore;
    1442        1434 :             ScoreBrackets->setRegScore(
    1443             :                 J + NUM_ALL_VGPRS, LGKM_CNT,
    1444             :                 std::max(
    1445        2868 :                     ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1446             :                     NewRegScore));
    1447             :           }
    1448             :         }
    1449             :       }
    1450             :     }
    1451             : 
    1452             :     // Also merge the WaitEvent information.
                           // Event upper bounds above the predecessor's lower bound are rebased
                           // the same way as register scores; only positive results are kept,
                           // merged by maximum.
    1453       73542 :     ForAllWaitEventType(W) {
    1454             :       enum InstCounterType T = PredScoreBrackets->eventCounter(W);
    1455             :       int PredEventUB = PredScoreBrackets->getEventUB(W);
    1456       35020 :       if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
    1457             :         int NewEventUB =
    1458        4034 :             MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
    1459        2017 :         if (NewEventUB > 0) {
    1460        2017 :           ScoreBrackets->setEventUB(
    1461        4034 :               W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1462             :         }
    1463             :       }
    1464             :     }
    1465             :   }
    1466             : 
    1467             :   // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
    1468             :   // Set the register scoreboard.
                         // Same merge as for predecessors, applied to every saved kill bracket
                         // when Block has no successors.
    1469       35569 :   if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    1470        6067 :     for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
    1471             :       // Now merge the gpr_reg_score information.
    1472       13461 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1473        5769 :            T = (enum InstCounterType)(T + 1)) {
    1474        5769 :         int PredLB = KillWaitBrackets[I]->getScoreLB(T);
    1475             :         int PredUB = KillWaitBrackets[I]->getScoreUB(T);
    1476        5769 :         if (PredLB < PredUB) {
    1477        1192 :           int PredScale = MaxPending[T] - PredUB;
    1478             :           // Merge vgpr scores.
    1479       10817 :           for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
    1480             :             int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
    1481        2811 :             if (PredRegScore <= PredLB)
    1482        2338 :               continue;
    1483         473 :             int NewRegScore = PredScale + PredRegScore;
    1484         473 :             ScoreBrackets->setRegScore(
    1485         946 :                 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
    1486             :           }
    1487             :           // Also need to merge sgpr scores for lgkm_cnt.
    1488        1192 :           if (T == LGKM_CNT) {
    1489       33229 :             for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
    1490             :               int PredRegScore =
    1491       10537 :                   KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
    1492       10537 :               if (PredRegScore <= PredLB)
    1493        8766 :                 continue;
    1494        1771 :               int NewRegScore = PredScale + PredRegScore;
    1495        1771 :               ScoreBrackets->setRegScore(
    1496             :                   J + NUM_ALL_VGPRS, LGKM_CNT,
    1497             :                   std::max(
    1498        3542 :                       ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
    1499             :                       NewRegScore));
    1500             :             }
    1501             :           }
    1502             :         }
    1503             :       }
    1504             : 
    1505             :       // Also merge the WaitEvent information.
    1506       40383 :       ForAllWaitEventType(W) {
    1507       19230 :         enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
    1508             :         int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
    1509       19230 :         if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
    1510             :           int NewEventUB =
    1511        2408 :               MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
    1512        1204 :           if (NewEventUB > 0) {
    1513        1204 :             ScoreBrackets->setEventUB(
    1514        2408 :                 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
    1515             :           }
    1516             :         }
    1517             :       }
    1518             :     }
    1519             :   }
    1520             : 
    1521             :   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
    1522             :   // sequencing predecessors, because changes to EXEC require waitcnts due to
    1523             :   // the delayed nature of these operations.
                         // Both events are rebased against the EXP_CNT bounds of each visited
                         // predecessor and merged by maximum.
    1524       22536 :   for (MachineBasicBlock *Pred : Block.predecessors()) {
    1525          78 :     if (!BlockVisitedSet.count(Pred)) {
    1526          78 :       continue;
    1527             :     }
    1528             : 
    1529             :     BlockWaitcntBrackets *PredScoreBrackets =
    1530             :         BlockWaitcntBracketsMap[Pred].get();
    1531             : 
    1532             :     int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    1533        3502 :     if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1534           0 :       int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
    1535           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1536           0 :       if (new_gds_ub > 0) {
    1537           0 :         ScoreBrackets->setEventUB(
    1538             :             GDS_GPR_LOCK,
    1539           0 :             std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
    1540             :       }
    1541             :     }
    1542             :     int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    1543        3502 :     if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
    1544           0 :       int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
    1545           0 :                        PredScoreBrackets->getScoreUB(EXP_CNT);
    1546           0 :       if (new_exp_ub > 0) {
    1547           0 :         ScoreBrackets->setEventUB(
    1548             :             EXP_GPR_LOCK,
    1549           0 :             std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
    1550             :       }
    1551             :     }
    1552             :   }
    1553       18956 : }
    1554             : 
    1555             : /// Return the "bottom" block of a loop. This differs from
    1556             : /// MachineLoop::getBottomBlock in that it works even if the loop is
    1557             : /// discontiguous.
                       ///
                       /// Here "bottom" means the block of \p Loop with the largest block number
                       /// (MachineBasicBlock::getNumber()). The search starts from the loop
                       /// header, so the header is returned when no block has a larger number.
    1558             : MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
    1559             :   MachineBasicBlock *Bottom = Loop->getHeader();
    1560        8784 :   for (MachineBasicBlock *MBB : Loop->blocks())
    1561        3724 :     if (MBB->getNumber() > Bottom->getNumber())
    1562             :       Bottom = MBB;
    1563             :   return Bottom;
    1564             : }
    1565             : 
    1566             : // Generate s_waitcnt instructions where needed.
                       //
                       // The block's score brackets are first initialized by
                       // mergeInputScoreBrackets(Block). Every instruction of \p Block is then
                       // visited once, in order:
                       //   * S_WAITCNT instructions are recorded via setWaitcnt(); those found in
                       //     TrackedWaitcntSet are additionally unlinked from the block.
                       //   * For every other instruction, generateSWaitCntInstBefore() and then
                       //     updateEventWaitCntAfter() are called with the block's score brackets.
                       //   * The vccz work-around and the GWS handling below are applied where
                       //     their conditions hold.
                       // After the walk, if \p Block is the bottom block (see loopBottom) of its
                       // containing loop and that loop's iteration count is above 2, an
                       // "s_waitcnt 0" is placed via insertWaitcntBeforeCF() to force convergence.
                       //
                       // \p MF is only used to attach the immediate operand to a newly created
                       // S_WAITCNT in the forced-convergence path.
    1567       18956 : void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
    1568             :                                             MachineBasicBlock &Block) {
    1569             :   // Initialize the state information.
    1570       18956 :   mergeInputScoreBrackets(Block);
    1571             : 
    1572       37912 :   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
    1573             : 
    1574             :   DEBUG({
    1575             :     dbgs() << "Block" << Block.getNumber();
    1576             :     ScoreBrackets->dump();
    1577             :   });
    1578             : 
    1579             :   // Walk over the instructions.
                         // Note: the iterator is advanced inside the loop body (before an
                         // instruction is unlinked, or at the very end), not in the for-header.
    1580       18956 :   for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
    1581      313248 :        Iter != E;) {
    1582             :     MachineInstr &Inst = *Iter;
    1583             :     // Handle any previously existing waitcnts.
    1584      591143 :     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
    1585             :       // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
    1586             :       // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
    1587             :       // as needed.
    1588             :       if (!TrackedWaitcntSet.count(&Inst))
    1589             :         ++Iter;
    1590             :       else {
                               // Step past the instruction first so that unlinking it does not
                               // disturb the iteration.
    1591             :         ++Iter;
    1592         241 :         Inst.removeFromParent();
    1593             :       }
                             // Recorded in both cases, i.e. also for a waitcnt that was just
                             // unlinked from the block above.
    1594             :       ScoreBrackets->setWaitcnt(&Inst);
    1595        2559 :       continue;
    1596             :     }
    1597             : 
    1598             :     // Kill instructions generate a conditional branch to the endmain block.
    1599             :     // Merge the current waitcnt state into the endmain block information.
    1600             :     // TODO: Are there other flavors of KILL instruction?
    1601      291733 :     if (Inst.getOpcode() == AMDGPU::KILL) {
    1602         271 :       addKillWaitBracket(ScoreBrackets);
    1603             :     }
    1604             : 
                           // Decide whether the vccz work-around is needed for this instruction.
                           // All of the following must hold: the instruction reads vccz and has
                           // not been handled before (VCCZBugHandledSet), the LGKM_CNT score lower
                           // bound is below its upper bound, an SMEM operation is pending, and the
                           // subtarget generation is SEA_ISLANDS or earlier.
                           // The decision is taken before generateSWaitCntInstBefore() runs; the
                           // work-around itself is emitted near the end of this iteration.
    1605             :     bool VCCZBugWorkAround = false;
    1606             :     if (readsVCCZ(Inst) &&
    1607             :         (!VCCZBugHandledSet.count(&Inst))) {
    1608             :       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
    1609         218 :               ScoreBrackets->getScoreUB(LGKM_CNT) &&
    1610          22 :           ScoreBrackets->hasPendingSMEM()) {
    1611           8 :         if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
    1612             :           VCCZBugWorkAround = true;
    1613             :       }
    1614             :     }
    1615             : 
    1616             :     // Generate an s_waitcnt instruction to be placed before
    1617             :     // cur_Inst, if needed.
    1618      291733 :     generateSWaitCntInstBefore(Inst, ScoreBrackets);
    1619             : 
    1620      291733 :     updateEventWaitCntAfter(Inst, ScoreBrackets);
    1621             : 
    1622             : #if 0 // TODO: implement resource type check controlled by options with ub = LB.
    1623             :     // If this instruction generates a S_SETVSKIP because it is an
    1624             :     // indexed resource, and we are on Tahiti, then it will also force
    1625             :     // an S_WAITCNT vmcnt(0)
    1626             :     if (RequireCheckResourceType(Inst, context)) {
    1627             :       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
    1628             :       ScoreBrackets->setScoreLB(VM_CNT,
    1629             :       ScoreBrackets->getScoreUB(VM_CNT));
    1630             :     }
    1631             : #endif
    1632             : 
    1633             :     ScoreBrackets->clearWaitcnt();
    1634             : 
    1635             :     DEBUG({
    1636             :       Inst.print(dbgs());
    1637             :       ScoreBrackets->dump();
    1638             :     });
    1639             : 
    1640             :     // Check to see if this is a GWS instruction. If so, and if this is CI or
    1641             :     // VI, then the generated code sequence will include an S_WAITCNT 0.
    1642             :     // TODO: Are these the only GWS instructions?
                           // NOTE(review): no subtarget check is performed here (see the TODO on
                           // the condition below); the update is applied for every GWS opcode.
    1643      583466 :     if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
    1644      291733 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
    1645      291733 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
    1646      583466 :         Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
    1647             :         Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
    1648             :       // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
                             // Update all three counters as if waited on up to their current
                             // upper bound.
    1649           0 :       ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    1650           0 :       ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    1651           0 :       ScoreBrackets->updateByWait(LGKM_CNT,
    1652             :                                   ScoreBrackets->getScoreUB(LGKM_CNT));
    1653             :     }
    1654             : 
    1655             :     // TODO: Remove this work-around after fixing the scheduler and enable the
    1656             :     // assert above.
    1657      291733 :     if (VCCZBugWorkAround) {
    1658             :       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
    1659             :       // bit is updated, so we can restore the bit by reading the value of
    1660             :       // vcc and then writing it back to the register.
    1661          21 :       BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
    1662           7 :               AMDGPU::VCC)
    1663           7 :           .addReg(AMDGPU::VCC);
                             // Remember the instruction so the work-around is not applied to it
                             // again when the block is walked another time.
    1664          14 :       VCCZBugHandledSet.insert(&Inst);
    1665             :     }
    1666             : 
    1667             :     ++Iter;
    1668             :   }
    1669             : 
    1670             :   // Check if we need to force convergence at loop footer.
                         // The footer is the block returned by loopBottom() for the innermost
                         // loop that MLI reports for this block.
    1671       37912 :   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
    1672       19624 :   if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
    1673         392 :     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1674             :     WaitcntData->print();
    1675             :     DEBUG(dbgs() << '\n';);
    1676             : 
    1677             :     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    1678             :     // placement and doesn't always guarantee convergence for a loop. Each
    1679             :     // loop should take at most 2 iterations for it to converge naturally.
    1680             :     // When this max is reached and result doesn't converge, we force
    1681             :     // convergence by inserting a s_waitcnt at the end of loop footer.
    1682         392 :     if (WaitcntData->getIterCnt() > 2) {
    1683             :       // To ensure convergence, need to make wait events at loop footer be no
    1684             :       // more than those from the previous iteration.
    1685             :       // As a simplification, instead of tracking individual scores and
    1686             :       // generating the precise wait count, just wait on 0.
    1687             :       bool HasPending = false;
                             // Waitcnt cached for this loop by an earlier forced convergence, or
                             // null if none has been created yet.
    1688           0 :       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
                             // For every counter whose upper bound is above its lower bound, raise
                             // the lower bound to the upper bound and note that something was
                             // pending.
    1689           0 :       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
    1690           0 :            T = (enum InstCounterType)(T + 1)) {
    1691           0 :         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
    1692             :           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    1693             :           HasPending = true;
    1694             :         }
    1695             :       }
    1696             : 
    1697           0 :       if (HasPending) {
    1698           0 :         if (!SWaitInst) {
                                 // No cached waitcnt: create a new S_WAITCNT with immediate
                                 // operand 0 and register it as pass-generated.
    1699           0 :           SWaitInst = Block.getParent()->CreateMachineInstr(
    1700           0 :               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
    1701             :           TrackedWaitcntSet.insert(SWaitInst);
    1702             :           const MachineOperand &Op = MachineOperand::CreateImm(0);
    1703           0 :           SWaitInst->addOperand(MF, Op);
    1704             : #if 0 // TODO: Format the debug output
    1705             :           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
    1706             :           OutputTransformAdd(SWaitInst, context);
    1707             : #endif
    1708             :         }
    1709             : #if 0 // TODO: ??
    1710             :         _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
    1711             : #endif
    1712             :       }
    1713             : 
                             // A cached waitcnt is re-inserted even when nothing was pending in
                             // this walk (SWaitInst is non-null regardless of HasPending).
    1714           0 :       if (SWaitInst) {
    1715             :         DEBUG({
    1716             :           SWaitInst->print(dbgs());
    1717             :           dbgs() << "\nAdjusted score board:";
    1718             :           ScoreBrackets->dump();
    1719             :         });
    1720             : 
    1721             :         // Add this waitcnt to the block. It is either newly created or
    1722             :         // created in previous iterations and added back since block traversal
    1723             :         // always removes waitcnts.
    1724           0 :         insertWaitcntBeforeCF(Block, SWaitInst);
    1725           0 :         WaitcntData->setWaitcnt(SWaitInst);
    1726             :       }
    1727             :     }
    1728             :   }
    1729       18956 : }
    1730             : 
                       /// Entry point of the pass for one machine function.
                       ///
                       /// Caches the subtarget, instruction/register info, loop info and the
                       /// hardware counter and register limits, clears the per-function sets, and
                       /// then works in three phases:
                       ///   1. Walks the blocks in reverse post-order and calls
                       ///      insertWaitcntInBlock() on each. When the bottom block of a loop is
                       ///      reached and the header's score brackets request a revisit, the walk
                       ///      jumps back to the header and the loop's iteration count is bumped.
                       ///   2. If the function contains a scalar store, inserts S_DCACHE_WB before
                       ///      each S_ENDPGM / SI_RETURN_TO_EPILOG that is not already preceded in
                       ///      its block by an S_DCACHE_WB with no scalar store in between.
                       ///   3. For non-entry functions, inserts "S_WAITCNT 0" at the first non-PHI
                       ///      position of the entry block.
                       ///
                       /// \returns the value of Modified, which is set only by phases 2 and 3.
                       /// NOTE(review): phase 1 can insert and unlink instructions (see
                       /// insertWaitcntInBlock) but does not update Modified, so this can return
                       /// false for a function that was changed -- confirm whether that is
                       /// intended.
    1731       16579 : bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
    1732       16579 :   ST = &MF.getSubtarget<SISubtarget>();
    1733       16579 :   TII = ST->getInstrInfo();
    1734       16579 :   TRI = &TII->getRegisterInfo();
    1735       16579 :   MRI = &MF.getRegInfo();
    1736       16579 :   MLI = &getAnalysis<MachineLoopInfo>();
    1737       33158 :   IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
    1738       16579 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1739       33158 :   AMDGPUASI = ST->getAMDGPUAS();
    1740             : 
                         // Maximum counter values, taken from the per-ISA-version bit masks.
    1741       16579 :   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
    1742       16579 :   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
    1743       16579 :   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
    1744             : 
    1745       33158 :   HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
    1746       33158 :   HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
    1747             :   assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
    1748             :   assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
    1749             : 
                         // First and last encoding values of the addressable VGPR and SGPR
                         // ranges (last = first + count - 1).
    1750       33158 :   RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
    1751       16579 :   RegisterEncoding.VGPRL =
    1752       16579 :       RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
    1753       16579 :   RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
    1754       16579 :   RegisterEncoding.SGPRL =
    1755       16579 :       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
    1756             : 
    1757             :   TrackedWaitcntSet.clear();
    1758             :   BlockVisitedSet.clear();
    1759             :   VCCZBugHandledSet.clear();
    1760             : 
    1761             :   // Walk over the blocks in reverse post-order, inserting
    1762             :   // s_waitcnt where needed.
                         // I is the block being processed; J trails it as the previously
                         // processed position. I is not advanced in the for-header because a loop
                         // revisit resets it to the loop header.
    1763             :   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
    1764             :   bool Modified = false;
    1765             :   for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1766             :            I = RPOT.begin(),
    1767             :            E = RPOT.end(), J = RPOT.begin();
    1768       35535 :        I != E;) {
    1769       18956 :     MachineBasicBlock &MBB = **I;
    1770             : 
    1771       37912 :     BlockVisitedSet.insert(&MBB);
    1772             : 
                           // Create the per-block score brackets on first visit.
    1773       37912 :     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1774       18956 :     if (!ScoreBrackets) {
    1775       14194 :       BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
    1776       14194 :       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    1777             :     }
                           // The value stored as "post order" is the block number; the revisit
                           // code below compares it against getNumber() to locate the header.
    1778       18956 :     ScoreBrackets->setPostOrder(MBB.getNumber());
                           // Create the per-loop data on first visit of any block of the loop.
    1779       37912 :     MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    1780       19624 :     if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
    1781             :       LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
    1782             : 
    1783             :     // If we are walking into the block from before the loop, then guarantee
    1784             :     // at least 1 re-walk over the loop to propagate the information, even if
    1785             :     // no S_WAITCNT instructions were generated.
    1786       19624 :     if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
    1787             :         (!BlockWaitcntProcessedSet.count(&MBB))) {
    1788         346 :       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
    1789             :       DEBUG(dbgs() << "set-revisit: Block"
    1790             :                    << ContainingLoop->getHeader()->getNumber() << '\n';);
    1791             :     }
    1792             : 
    1793             :     // Walk over the instructions.
    1794       18956 :     insertWaitcntInBlock(MF, MBB);
    1795             : 
    1796             :     // Flag that waitcnts have been processed at least once.
    1797       37912 :     BlockWaitcntProcessedSet.insert(&MBB);
    1798             : 
    1799             :     // See if we want to revisit the loop.
    1800       19624 :     if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
    1801         392 :       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
    1802             :       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
    1803         392 :       if (EntrySB && EntrySB->getRevisitLoop()) {
                               // Consume the revisit request, then move I back to the RPO
                               // position of the loop header and process the loop again.
    1804             :         EntrySB->setRevisitLoop(false);
    1805             :         J = I;
    1806         193 :         int32_t PostOrder = EntrySB->getPostOrder();
    1807             :         // TODO: Avoid this loop. Find another way to set I.
                               // Linear search from the start of the traversal for the block
                               // whose number equals the header's stored value.
    1808             :         for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
    1809             :                  X = RPOT.begin(),
    1810             :                  Y = RPOT.end();
    1811         471 :              X != Y; ++X) {
    1812         471 :           MachineBasicBlock &MBBX = **X;
    1813         471 :           if (MBBX.getNumber() == PostOrder) {
    1814             :             I = X;
    1815             :             break;
    1816             :           }
    1817             :         }
    1818         193 :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1819             :         WaitcntData->incIterCnt();
    1820             :         DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
                               // Skip the normal advance at the bottom of the loop body; I already
                               // points at the header.
    1821         193 :         continue;
    1822             :       } else {
    1823         199 :         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    1824             :         // Loop converged, reset iteration count. If this loop gets revisited,
    1825             :         // it must be from an outer loop, the counter will restart, this will
    1826             :         // ensure we don't force convergence on such revisits.
    1827             :         WaitcntData->resetIterCnt();
    1828             :       }
    1829             :     }
    1830             : 
    1831             :     J = I;
    1832             :     ++I;
    1833             :   }
    1834             : 
                         // Blocks containing a program-end instruction; a block is appended once
                         // per such instruction found in it.
    1835             :   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    1836             : 
    1837             :   bool HaveScalarStores = false;
    1838             : 
                         // Single scan over all instructions: detect scalar stores and collect
                         // the blocks that end the program.
    1839       35188 :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
    1840             :        ++BI) {
    1841             :     MachineBasicBlock &MBB = *BI;
    1842             : 
    1843      356259 :     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
    1844             :          ++I) {
    1845      637832 :       if (!HaveScalarStores && TII->isScalarStore(*I))
    1846             :         HaveScalarStores = true;
    1847             : 
    1848      638082 :       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
    1849             :           I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
    1850       15416 :         EndPgmBlocks.push_back(&MBB);
    1851             :     }
    1852             :   }
    1853             : 
    1854       16579 :   if (HaveScalarStores) {
    1855             :     // If scalar writes are used, the cache must be flushed or else the next
    1856             :     // wave to reuse the same scratch memory can be clobbered.
    1857             :     //
    1858             :     // Insert s_dcache_wb at wave termination points if there were any scalar
    1859             :     // stores, and only if the cache hasn't already been flushed. This could be
    1860             :     // improved by looking across blocks for flushes in postdominating blocks
    1861             :     // from the stores but an explicitly requested flush is probably very rare.
    1862          24 :     for (MachineBasicBlock *MBB : EndPgmBlocks) {
                             // True while the most recent S_DCACHE_WB in this block has not been
                             // followed by a scalar store.
    1863             :       bool SeenDCacheWB = false;
    1864             : 
    1865         133 :       for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
    1866             :            ++I) {
    1867         234 :         if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
    1868             :           SeenDCacheWB = true;
    1869         117 :         else if (TII->isScalarStore(*I))
    1870             :           SeenDCacheWB = false;
    1871             : 
    1872             :         // FIXME: It would be better to insert this before a waitcnt if any.
    1873         109 :         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
    1874         125 :              I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
    1875             :             !SeenDCacheWB) {
    1876             :           Modified = true;
    1877          16 :           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
    1878             :         }
    1879             :       }
    1880             :     }
    1881             :   }
    1882             : 
    1883       16579 :   if (!MFI->isEntryFunction()) {
    1884             :     // Wait for any outstanding memory operations that the input registers may
    1885             :     // depend on. We can't track them and it's better to do the wait after the
    1886             :     // costly call sequence.
    1887             : 
    1888             :     // TODO: Could insert earlier and schedule more liberally with operations
    1889             :     // that only use caller preserved registers.
    1890             :     MachineBasicBlock &EntryBB = MF.front();
    1891        4696 :     BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    1892             :       .addImm(0);
    1893             : 
    1894             :     Modified = true;
    1895             :   }
    1896             : 
    1897       16579 :   return Modified;
    1898             : }

Generated by: LCOV version 1.13