1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates the latest instruction counter score
68// associated with an operand. Used to determine whether an
69// s_waitcnt instruction needs to be emitted.
70
71enum InstCounterType {
72 LOAD_CNT = 0, // VMcnt prior to gfx12.
73 DS_CNT, // LGKMcnt prior to gfx12.
74 EXP_CNT, //
75 STORE_CNT, // VScnt in gfx10/gfx11.
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
78 BVH_CNT, // gfx12+ only.
79 KM_CNT, // gfx12+ only.
80 X_CNT, // gfx1250.
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84} // namespace
85
86namespace llvm {
87template <> struct enum_iteration_traits<InstCounterType> {
88 static constexpr bool is_iterable = true;
89};
90} // namespace llvm
91
92namespace {
93// Return an iterator over all counters between LOAD_CNT (the first counter)
94// and \c MaxCounter (exclusive, default value yields an enumeration over
95// all counters).
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
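
// Illustrative usage (editorial note, not part of the pass): iterating only
// the "normal" counters, e.g. on targets without the gfx12 extended set:
//
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // Visits LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT, in that order.
//   }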
99
100/// Integer IDs used to track vector memory locations we may have to wait on.
101/// Encoded as u16 chunks:
102///
103/// [0, REGUNITS_END ): MCRegUnit
104/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
105///
106/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
107/// It gives (1 << 16) entries per category, which is more than enough
108/// for all register units. MCPhysReg is u16 so we don't even support >u16
109/// physical register numbers at this time, let alone >u16 register units.
110/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
111/// is enough for all register units.
112using VMEMID = uint32_t;
113
114enum : VMEMID {
115 TRACKINGID_RANGE_LEN = (1 << 16),
116
117 // Important: MCRegUnits must always be tracked starting from 0, as we
118 // need to be able to convert between a MCRegUnit and a VMEMID freely.
119 REGUNITS_BEGIN = 0,
120 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
121
122 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
123 // entry, which is updated for all LDS DMA operations encountered.
124 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
125 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
126 LDSDMA_BEGIN = REGUNITS_END,
127 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
128};
129
130/// Convert a MCRegUnit to a VMEMID.
131static constexpr VMEMID toVMEMID(MCRegUnit RU) {
132 return static_cast<unsigned>(RU);
133}
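
// Worked example of the VMEMID encoding above (editorial illustration; values
// follow from TRACKINGID_RANGE_LEN == 1 << 16):
//
//   toVMEMID(RU)      == 42       for a register unit RU numbered 42
//   LDSDMA_BEGIN      == 0x10000  // the "common" LDS DMA entry
//   LDSDMA_BEGIN + 3  == 0x10003  // tracks LDSDMAStores[2] in WaitcntBrackets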
134
135struct HardwareLimits {
136 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
137 unsigned ExpcntMax;
138 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
139 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
140 unsigned SamplecntMax; // gfx12+ only.
141 unsigned BvhcntMax; // gfx12+ only.
142 unsigned KmcntMax; // gfx12+ only.
143 unsigned XcntMax; // gfx1250.
144};
145
146#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
147 DECL(VMEM_ACCESS) /* vmem read & write */ \
148 DECL(VMEM_READ_ACCESS) /* vmem read */ \
149 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
150 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
151 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
152 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
153 DECL(VMEM_GROUP) /* vmem group */ \
154 DECL(LDS_ACCESS) /* lds read & write */ \
155 DECL(GDS_ACCESS) /* gds read & write */ \
156 DECL(SQ_MESSAGE) /* send message */ \
157 DECL(SCC_WRITE) /* write to SCC from barrier */ \
158 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
159 DECL(SMEM_GROUP) /* scalar-memory group */ \
160 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
161 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
162 DECL(EXP_POS_ACCESS) /* write to export position */ \
163 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
164 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
165 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
166
167// clang-format off
168#define AMDGPU_EVENT_ENUM(Name) Name,
169enum WaitEventType {
170 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
171 NUM_WAIT_EVENTS
172};
173#undef AMDGPU_EVENT_ENUM
174
175#define AMDGPU_EVENT_NAME(Name) #Name,
176static constexpr StringLiteral WaitEventTypeName[] = {
177 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
178};
179#undef AMDGPU_EVENT_NAME
180// clang-format on
181
182// Enumerate different types of result-returning VMEM operations. Although
183// s_waitcnt orders them all with a single vmcnt counter, in the absence of
184// s_waitcnt only instructions of the same VmemType are guaranteed to write
185// their results in order -- so there is no need to insert an s_waitcnt between
186// two instructions of the same type that write the same vgpr.
187enum VmemType {
188 // BUF instructions and MIMG instructions without a sampler.
189 VMEM_NOSAMPLER,
190 // MIMG instructions with a sampler.
191 VMEM_SAMPLER,
192 // BVH instructions
193 VMEM_BVH,
194 NUM_VMEM_TYPES
195};
196
197// Maps values of InstCounterType to the instruction that waits on that
198// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
199// returns true.
200static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
201 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
202 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
203 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
204
205static bool updateVMCntOnly(const MachineInstr &Inst) {
206 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
207 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
208}
209
210#ifndef NDEBUG
211static bool isNormalMode(InstCounterType MaxCounter) {
212 return MaxCounter == NUM_NORMAL_INST_CNTS;
213}
214#endif // NDEBUG
215
216VmemType getVmemType(const MachineInstr &Inst) {
217 assert(updateVMCntOnly(Inst));
218 if (!SIInstrInfo::isImage(Inst))
219 return VMEM_NOSAMPLER;
220 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
221 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
222 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
223
224 if (BaseInfo->BVH)
225 return VMEM_BVH;
226
227 // We have to make an additional check for isVSAMPLE here since some
228 // instructions don't have a sampler, but are still classified as sampler
229 // instructions for the purposes of e.g. waitcnt.
230 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
231 return VMEM_SAMPLER;
232
233 return VMEM_NOSAMPLER;
234}
235
236unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
237 switch (T) {
238 case LOAD_CNT:
239 return Wait.LoadCnt;
240 case EXP_CNT:
241 return Wait.ExpCnt;
242 case DS_CNT:
243 return Wait.DsCnt;
244 case STORE_CNT:
245 return Wait.StoreCnt;
246 case SAMPLE_CNT:
247 return Wait.SampleCnt;
248 case BVH_CNT:
249 return Wait.BvhCnt;
250 case KM_CNT:
251 return Wait.KmCnt;
252 case X_CNT:
253 return Wait.XCnt;
254 default:
255 llvm_unreachable("bad InstCounterType");
256 }
257}
258
259void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
260 unsigned &WC = getCounterRef(Wait, T);
261 WC = std::min(WC, Count);
262}
263
264void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
265 getCounterRef(Wait, T) = ~0u;
266}
267
268unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
269 return getCounterRef(Wait, T);
270}
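
// Example use of the helpers above (editorial illustration). This assumes the
// default-constructed AMDGPU::Waitcnt leaves every counter at ~0u ("no wait"),
// which is what addWait/setNoWait rely on:
//
//   AMDGPU::Waitcnt W;        // all counters start as "no wait"
//   addWait(W, LOAD_CNT, 3);  // W.LoadCnt == 3
//   addWait(W, LOAD_CNT, 1);  // W.LoadCnt == 1 (combining keeps the minimum)
//   setNoWait(W, LOAD_CNT);   // W.LoadCnt == ~0u again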
271
272// Mapping from event to counter according to the table masks.
273InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
274 for (auto T : inst_counter_types()) {
275 if (masks[T] & (1 << E))
276 return T;
277 }
278 llvm_unreachable("event type has no associated counter");
279}
280
281class WaitcntBrackets;
282
283// This abstracts the logic for generating and updating S_WAIT* instructions
284// away from the analysis that determines where they are needed. This was
285// done because the set of counters and instructions for waiting on them
286// underwent a major shift with gfx12, sufficiently so that having this
287// abstraction allows the main analysis logic to be simpler than it would
288// otherwise have had to become.
289class WaitcntGenerator {
290protected:
291 const GCNSubtarget *ST = nullptr;
292 const SIInstrInfo *TII = nullptr;
293 AMDGPU::IsaVersion IV;
294 InstCounterType MaxCounter;
295 bool OptNone;
296
297public:
298 WaitcntGenerator() = default;
299 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
300 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
301 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
302 OptNone(MF.getFunction().hasOptNone() ||
303 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
304
305 // Return true if the current function should be compiled with no
306 // optimization.
307 bool isOptNone() const { return OptNone; }
308
309 // Edits an existing sequence of wait count instructions according
310 // to an incoming Waitcnt value, which is itself updated to reflect
311 // any new wait count instructions which may need to be generated by
312 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
313 // were made.
314 //
315 // This editing will usually just update operands, but it may also
316 // delete instructions if the incoming Wait value indicates they are not
317 // needed. It may also remove existing instructions for which a wait
318 // is needed if it can be determined that it is better to generate new
319 // instructions later, as can happen on gfx12.
320 virtual bool
321 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
322 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
323 MachineBasicBlock::instr_iterator It) const = 0;
324
325 // Transform a soft waitcnt into a normal one.
326 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
327
328 // Generates new wait count instructions according to the value of
329 // Wait, returning true if any new instructions were created.
330 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
331 MachineBasicBlock::instr_iterator It,
332 AMDGPU::Waitcnt Wait) = 0;
333
334 // Returns an array of bit masks which can be used to map values in
335 // WaitEventType to corresponding counter values in InstCounterType.
336 virtual const unsigned *getWaitEventMask() const = 0;
337
338 // Returns a new waitcnt with all counters except VScnt set to 0. If
339 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
340 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
341
342 virtual ~WaitcntGenerator() = default;
343
344 // Create a mask value from the initializer list of wait event types.
345 static constexpr unsigned
346 eventMask(std::initializer_list<WaitEventType> Events) {
347 unsigned Mask = 0;
348 for (auto &E : Events)
349 Mask |= 1 << E;
350
351 return Mask;
352 }
353};
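
// eventMask() illustration (editorial note): bit positions follow the
// WaitEventType enumeration order declared above, so for example
//
//   constexpr unsigned M =
//       WaitcntGenerator::eventMask({VMEM_ACCESS, VMEM_READ_ACCESS});
//
// sets bit 0 (VMEM_ACCESS) and bit 1 (VMEM_READ_ACCESS), i.e. M == 0b11.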
354
355class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
356public:
357 using WaitcntGenerator::WaitcntGenerator;
358
359 bool
360 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
361 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
362 MachineBasicBlock::instr_iterator It) const override;
363
364 bool createNewWaitcnt(MachineBasicBlock &Block,
365 MachineBasicBlock::instr_iterator It,
366 AMDGPU::Waitcnt Wait) override;
367
368 const unsigned *getWaitEventMask() const override {
369 assert(ST);
370
371 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
372 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
373 VMEM_BVH_READ_ACCESS}),
374 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
375 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
376 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
377 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
378 0,
379 0,
380 0,
381 0};
382
383 return WaitEventMaskForInstPreGFX12;
384 }
385
386 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
387};
388
389class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
390public:
391 using WaitcntGenerator::WaitcntGenerator;
392
393 bool
394 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
395 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
396 MachineBasicBlock::instr_iterator It) const override;
397
398 bool createNewWaitcnt(MachineBasicBlock &Block,
399 MachineBasicBlock::instr_iterator It,
400 AMDGPU::Waitcnt Wait) override;
401
402 const unsigned *getWaitEventMask() const override {
403 assert(ST);
404
405 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
406 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
407 eventMask({LDS_ACCESS, GDS_ACCESS}),
408 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
409 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
410 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
411 eventMask({VMEM_SAMPLER_READ_ACCESS}),
412 eventMask({VMEM_BVH_READ_ACCESS}),
413 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
414 eventMask({VMEM_GROUP, SMEM_GROUP})};
415
416 return WaitEventMaskForInstGFX12Plus;
417 }
418
419 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
420};
421
422class SIInsertWaitcnts {
423public:
424 const GCNSubtarget *ST;
425 const SIInstrInfo *TII = nullptr;
426 const SIRegisterInfo *TRI = nullptr;
427 const MachineRegisterInfo *MRI = nullptr;
428 InstCounterType SmemAccessCounter;
429 InstCounterType MaxCounter;
430 const unsigned *WaitEventMaskForInst;
431
432private:
433 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
434 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
435 MachineLoopInfo *MLI;
436 MachinePostDominatorTree *PDT;
437 AliasAnalysis *AA = nullptr;
438
439 struct BlockInfo {
440 std::unique_ptr<WaitcntBrackets> Incoming;
441 bool Dirty = true;
442 };
443
444 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
445
446 bool ForceEmitWaitcnt[NUM_INST_CNTS];
447
448 // In any given run of this pass, WCG will point to one of these two
449 // generator objects, which must have been re-initialised before use
450 // with a value constructed from the current subtarget.
451 WaitcntGeneratorPreGFX12 WCGPreGFX12;
452 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
453
454 WaitcntGenerator *WCG = nullptr;
455
456 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
457 // message.
458 DenseSet<MachineInstr *> ReleaseVGPRInsts;
459
460 HardwareLimits Limits;
461
462public:
463 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
464 AliasAnalysis *AA)
465 : MLI(MLI), PDT(PDT), AA(AA) {
466 (void)ForceExpCounter;
467 (void)ForceLgkmCounter;
468 (void)ForceVMCounter;
469 }
470
471 unsigned getWaitCountMax(InstCounterType T) const {
472 switch (T) {
473 case LOAD_CNT:
474 return Limits.LoadcntMax;
475 case DS_CNT:
476 return Limits.DscntMax;
477 case EXP_CNT:
478 return Limits.ExpcntMax;
479 case STORE_CNT:
480 return Limits.StorecntMax;
481 case SAMPLE_CNT:
482 return Limits.SamplecntMax;
483 case BVH_CNT:
484 return Limits.BvhcntMax;
485 case KM_CNT:
486 return Limits.KmcntMax;
487 case X_CNT:
488 return Limits.XcntMax;
489 default:
490 break;
491 }
492 return 0;
493 }
494
495 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
496 bool isPreheaderToFlush(MachineBasicBlock &MBB,
497 const WaitcntBrackets &ScoreBrackets);
498 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
499 bool run(MachineFunction &MF);
500
501 void setForceEmitWaitcnt() {
502// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
503// For debug builds, get the debug counter info and adjust if need be
504#ifndef NDEBUG
505 if (DebugCounter::isCounterSet(ForceExpCounter) &&
506 DebugCounter::shouldExecute(ForceExpCounter)) {
507 ForceEmitWaitcnt[EXP_CNT] = true;
508 } else {
509 ForceEmitWaitcnt[EXP_CNT] = false;
510 }
511
512 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
513 DebugCounter::shouldExecute(ForceLgkmCounter)) {
514 ForceEmitWaitcnt[DS_CNT] = true;
515 ForceEmitWaitcnt[KM_CNT] = true;
516 } else {
517 ForceEmitWaitcnt[DS_CNT] = false;
518 ForceEmitWaitcnt[KM_CNT] = false;
519 }
520
521 if (DebugCounter::isCounterSet(ForceVMCounter) &&
522 DebugCounter::shouldExecute(ForceVMCounter)) {
523 ForceEmitWaitcnt[LOAD_CNT] = true;
524 ForceEmitWaitcnt[SAMPLE_CNT] = true;
525 ForceEmitWaitcnt[BVH_CNT] = true;
526 } else {
527 ForceEmitWaitcnt[LOAD_CNT] = false;
528 ForceEmitWaitcnt[SAMPLE_CNT] = false;
529 ForceEmitWaitcnt[BVH_CNT] = false;
530 }
531#endif // NDEBUG
532 }
533
534 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
535 // instruction.
536 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
537 switch (Inst.getOpcode()) {
538 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
539 case AMDGPU::GLOBAL_INV:
540 return VMEM_READ_ACCESS; // tracked using loadcnt
541 case AMDGPU::GLOBAL_WB:
542 case AMDGPU::GLOBAL_WBINV:
543 return VMEM_WRITE_ACCESS; // tracked using storecnt
544 default:
545 break;
546 }
547
548 // Maps VMEM access types to their corresponding WaitEventType.
549 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
550 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
551
553 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
554 // these should use VM_CNT.
555 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
556 return VMEM_ACCESS;
557 if (Inst.mayStore() &&
558 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
559 if (TII->mayAccessScratch(Inst))
560 return SCRATCH_WRITE_ACCESS;
561 return VMEM_WRITE_ACCESS;
562 }
563 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
564 return VMEM_READ_ACCESS;
565 return VmemReadMapping[getVmemType(Inst)];
566 }
567
568 bool isVmemAccess(const MachineInstr &MI) const;
569 bool generateWaitcntInstBefore(MachineInstr &MI,
570 WaitcntBrackets &ScoreBrackets,
571 MachineInstr *OldWaitcntInstr,
572 bool FlushVmCnt);
573 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
574 MachineBasicBlock::instr_iterator It,
575 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
576 MachineInstr *OldWaitcntInstr);
577 void updateEventWaitcntAfter(MachineInstr &Inst,
578 WaitcntBrackets *ScoreBrackets);
579 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
580 MachineBasicBlock *Block) const;
581 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
582 WaitcntBrackets &ScoreBrackets);
583 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
584 WaitcntBrackets &ScoreBrackets);
585};
586
587// This object maintains the current score brackets of each wait counter, and
588// a per-register scoreboard for each wait counter.
589//
590// We also maintain the latest score for every event type that can change the
591// waitcnt in order to know if there are multiple types of events within
592// the brackets. When multiple event types happen within a bracket, the
593// wait count may be decremented out of order, so we need to put in an
594// "s_waitcnt 0" before use.
595class WaitcntBrackets {
596public:
597 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
598 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
599 }
600
601#ifndef NDEBUG
602 ~WaitcntBrackets() {
603 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
604 for (auto &[ID, Val] : VMem) {
605 if (Val.empty())
606 ++NumUnusedVmem;
607 }
608 for (auto &[ID, Val] : SGPRs) {
609 if (Val.empty())
610 ++NumUnusedSGPRs;
611 }
612
613 if (NumUnusedVmem || NumUnusedSGPRs) {
614 errs() << "WaitcntBracket had unused entries at destruction time: "
615 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
616 << " SGPR unused entries\n";
617 std::abort();
618 }
619 }
620#endif
621
622 bool isSmemCounter(InstCounterType T) const {
623 return T == Context->SmemAccessCounter || T == X_CNT;
624 }
625
626 unsigned getSgprScoresIdx(InstCounterType T) const {
627 assert(isSmemCounter(T) && "Invalid SMEM counter");
628 return T == X_CNT ? 1 : 0;
629 }
630
631 unsigned getScoreLB(InstCounterType T) const {
632 assert(T < NUM_INST_CNTS);
633 return ScoreLBs[T];
634 }
635
636 unsigned getScoreUB(InstCounterType T) const {
637 assert(T < NUM_INST_CNTS);
638 return ScoreUBs[T];
639 }
640
641 unsigned getScoreRange(InstCounterType T) const {
642 return getScoreUB(T) - getScoreLB(T);
643 }
644
645 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
646 auto It = SGPRs.find(RU);
647 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
648 }
649
650 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
651 auto It = VMem.find(TID);
652 return It != VMem.end() ? It->second.Scores[T] : 0;
653 }
654
655 bool merge(const WaitcntBrackets &Other);
656
657 bool counterOutOfOrder(InstCounterType T) const;
658 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
659 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
660 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
661 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
662 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
663
664 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
665 AMDGPU::Waitcnt &Wait) const;
666 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
667 AMDGPU::Waitcnt &Wait) const;
668 void tryClearSCCWriteEvent(MachineInstr *Inst);
669
670 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
671 void applyWaitcnt(InstCounterType T, unsigned Count);
672 void updateByEvent(WaitEventType E, MachineInstr &MI);
673
674 unsigned hasPendingEvent() const { return PendingEvents; }
675 unsigned hasPendingEvent(WaitEventType E) const {
676 return PendingEvents & (1 << E);
677 }
678 unsigned hasPendingEvent(InstCounterType T) const {
679 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
680 assert((HasPending != 0) == (getScoreRange(T) != 0));
681 return HasPending;
682 }
683
684 bool hasMixedPendingEvents(InstCounterType T) const {
685 unsigned Events = hasPendingEvent(T);
686 // Return true if more than one bit is set in Events.
687 return Events & (Events - 1);
688 }
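
  // Bit-trick illustration (editorial note): for Events == 0b0110,
  // Events & (Events - 1) == 0b0100 != 0, i.e. more than one event type is
  // pending; for Events == 0b0100 the expression is 0 (at most one type).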
689
690 bool hasPendingFlat() const {
691 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
692 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
693 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
694 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
695 }
696
697 void setPendingFlat() {
698 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
699 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
700 }
701
702 bool hasPendingGDS() const {
703 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
704 }
705
706 unsigned getPendingGDSWait() const {
707 return std::min(getScoreUB(DS_CNT) - LastGDS,
708 Context->getWaitCountMax(DS_CNT) - 1);
709 }
710
711 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
712
713 // Return true if there might be pending writes to the vgpr-interval by VMEM
714 // instructions with types different from V.
715 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
716 for (MCRegUnit RU : regunits(Reg)) {
717 auto It = VMem.find(toVMEMID(RU));
718 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
719 return true;
720 }
721 return false;
722 }
723
724 void clearVgprVmemTypes(MCPhysReg Reg) {
725 for (MCRegUnit RU : regunits(Reg)) {
726 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
727 It->second.VMEMTypes = 0;
728 if (It->second.empty())
729 VMem.erase(It);
730 }
731 }
732 }
733
734 void setStateOnFunctionEntryOrReturn() {
735 setScoreUB(STORE_CNT,
736 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
737 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
738 }
739
740 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
741 return LDSDMAStores;
742 }
743
744 bool hasPointSampleAccel(const MachineInstr &MI) const;
745 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
746 MCPhysReg RU) const;
747
748 void print(raw_ostream &) const;
749 void dump() const { print(dbgs()); }
750
751 // Free up memory by removing empty entries from the DenseMaps that track
752 // event scores.
753 void purgeEmptyTrackingData();
754
755private:
756 struct MergeInfo {
757 unsigned OldLB;
758 unsigned OtherLB;
759 unsigned MyShift;
760 unsigned OtherShift;
761 };
762
763 void determineWaitForScore(InstCounterType T, unsigned Score,
764 AMDGPU::Waitcnt &Wait) const;
765
766 static bool mergeScore(const MergeInfo &M, unsigned &Score,
767 unsigned OtherScore);
768
769 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
770 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
771 if (!Context->TRI->isInAllocatableClass(Reg))
772 return {{}, {}};
773 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
774 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
775 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
776 Reg = Context->TRI->get32BitRegister(Reg);
777 return Context->TRI->regunits(Reg);
778 }
779
780 void setScoreLB(InstCounterType T, unsigned Val) {
781 assert(T < NUM_INST_CNTS);
782 ScoreLBs[T] = Val;
783 }
784
785 void setScoreUB(InstCounterType T, unsigned Val) {
786 assert(T < NUM_INST_CNTS);
787 ScoreUBs[T] = Val;
788
789 if (T != EXP_CNT)
790 return;
791
792 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
793 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
794 }
795
796 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
797 const SIRegisterInfo *TRI = Context->TRI;
798 if (Reg == AMDGPU::SCC) {
799 SCCScore = Val;
800 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
801 for (MCRegUnit RU : regunits(Reg))
802 VMem[toVMEMID(RU)].Scores[T] = Val;
803 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
804 auto STy = getSgprScoresIdx(T);
805 for (MCRegUnit RU : regunits(Reg))
806 SGPRs[RU].Scores[STy] = Val;
807 } else {
808 llvm_unreachable("Register cannot be tracked/unknown register!");
809 }
810 }
811
812 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
813 VMem[TID].Scores[T] = Val;
814 }
815
816 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
817 unsigned Val);
818
819 const SIInsertWaitcnts *Context;
820
821 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
822 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
823 unsigned PendingEvents = 0;
824 // Remember the last flat memory operation.
825 unsigned LastFlat[NUM_INST_CNTS] = {0};
826 // Remember the last GDS operation.
827 unsigned LastGDS = 0;
828
829 // The score tracking logic is fragmented as follows:
830 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
831 // - SGPRs: SGPR RegUnits
832 // - SCC: Non-allocatable and not general purpose: not a SGPR.
833 //
834 // For the VMem case, if the key is within the range of LDS DMA IDs,
835 // then the corresponding index into the `LDSDMAStores` vector below is:
836 // Key - LDSDMA_BEGIN - 1
837 // This is because LDSDMA_BEGIN is a generic entry and does not have an
838 // associated MachineInstr.
839 //
840 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
841
842 struct VMEMInfo {
843 // Scores for all instruction counters.
844 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
845 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
846 unsigned VMEMTypes = 0;
847
848 bool empty() const {
849 return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
850 }
851 };
852
853 struct SGPRInfo {
854 // Wait count scores for each SGPR. Only DS_CNT (corresponding to LGKMcnt
855 // pre-gfx12) or KM_CNT (gfx12+ only), plus X_CNT (gfx1250), are relevant.
856 // Entry 0 holds the DS_CNT/KM_CNT score and entry 1 holds the X_CNT
857 // score.
858 std::array<unsigned, 2> Scores = {0};
859
860 bool empty() const { return !Scores[0] && !Scores[1]; }
861 };
862
863 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
864 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
865
866 // Reg score for SCC.
867 unsigned SCCScore = 0;
868 // The unique instruction that has an SCC write pending, if there is one.
869 const MachineInstr *PendingSCCWrite = nullptr;
870
871 // Store representative LDS DMA operations. The only useful info here is
872 // alias info. One store is kept per unique AAInfo.
873 SmallVector<const MachineInstr *> LDSDMAStores;
874};
875
876class SIInsertWaitcntsLegacy : public MachineFunctionPass {
877public:
878 static char ID;
879 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
880
881 bool runOnMachineFunction(MachineFunction &MF) override;
882
883 StringRef getPassName() const override {
884 return "SI insert wait instructions";
885 }
886
887 void getAnalysisUsage(AnalysisUsage &AU) const override {
888 AU.setPreservesCFG();
889 AU.addRequired<MachineLoopInfoWrapperPass>();
890 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
891 AU.addUsedIfAvailable<AAResultsWrapperPass>();
892 AU.addPreserved<AAResultsWrapperPass>();
893 MachineFunctionPass::getAnalysisUsage(AU);
894 }
895};
896
897} // end anonymous namespace
898
899void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
900 InstCounterType CntTy, unsigned Score) {
901 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
902}
903
904// Return true if the subtarget is one that enables Point Sample Acceleration
905// and the MachineInstr passed in is one to which it might be applied (the
906// hardware makes this decision based on several factors, but we can't determine
907// this at compile time, so we have to assume it might be applied if the
908// instruction supports it).
909bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
910 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
911 return false;
912
913 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
914 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
915 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
916 return BaseInfo->PointSampleAccel;
917}
918
919// Return true if the subtarget enables Point Sample Acceleration, the supplied
920// MachineInstr is one to which it might be applied and the supplied interval is
921// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
922// (this is the type that a point sample accelerated instruction effectively
923// becomes)
924bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
925 MCPhysReg Reg) const {
926 if (!hasPointSampleAccel(MI))
927 return false;
928
929 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
930}
931
932void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
933 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
934 assert(T < Context->MaxCounter);
935
936 unsigned UB = getScoreUB(T);
937 unsigned CurrScore = UB + 1;
938 if (CurrScore == 0)
939 report_fatal_error("InsertWaitcnt score wraparound");
940 // PendingEvents and ScoreUB need to be updated regardless of whether this
941 // event changes the score of a register or not.
942 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
943 PendingEvents |= 1 << E;
944 setScoreUB(T, CurrScore);
945
946 const SIRegisterInfo *TRI = Context->TRI;
947 const MachineRegisterInfo *MRI = Context->MRI;
948 const SIInstrInfo *TII = Context->TII;
949
950 if (T == EXP_CNT) {
951 // Put score on the source vgprs. If this is a store, just use those
952 // specific register(s).
953 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
954 // All GDS operations must protect their address register (same as
955 // export.)
956 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
957 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
958
959 if (Inst.mayStore()) {
960 if (const auto *Data0 =
961 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
962 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
963 if (const auto *Data1 =
964 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
965 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
966 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
967 Inst.getOpcode() != AMDGPU::DS_APPEND &&
968 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
969 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
970 for (const MachineOperand &Op : Inst.all_uses()) {
971 if (TRI->isVectorRegister(*MRI, Op.getReg()))
972 setScoreByOperand(Op, EXP_CNT, CurrScore);
973 }
974 }
975 } else if (TII->isFLAT(Inst)) {
976 if (Inst.mayStore()) {
977 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
978 EXP_CNT, CurrScore);
979 } else if (SIInstrInfo::isAtomicRet(Inst)) {
980 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
981 EXP_CNT, CurrScore);
982 }
983 } else if (TII->isMIMG(Inst)) {
984 if (Inst.mayStore()) {
985 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
986 } else if (SIInstrInfo::isAtomicRet(Inst)) {
987 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
988 EXP_CNT, CurrScore);
989 }
990 } else if (TII->isMTBUF(Inst)) {
991 if (Inst.mayStore())
992 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
993 } else if (TII->isMUBUF(Inst)) {
994 if (Inst.mayStore()) {
995 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
996 } else if (SIInstrInfo::isAtomicRet(Inst)) {
997 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
998 EXP_CNT, CurrScore);
999 }
1000 } else if (TII->isLDSDIR(Inst)) {
1001 // LDSDIR instructions attach the score to the destination.
1002 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1003 EXP_CNT, CurrScore);
1004 } else {
1005 if (TII->isEXP(Inst)) {
1006 // For export the destination registers are really temps that
1007 // can be used as the actual source after export patching, so
1008 // we need to treat them like sources and set the EXP_CNT
1009 // score.
1010 for (MachineOperand &DefMO : Inst.all_defs()) {
1011 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1012 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1013 }
1014 }
1015 }
1016 for (const MachineOperand &Op : Inst.all_uses()) {
1017 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1018 setScoreByOperand(Op, EXP_CNT, CurrScore);
1019 }
1020 }
1021 } else if (T == X_CNT) {
1022 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1023 if (PendingEvents & (1 << OtherEvent)) {
1024 // Hardware inserts an implicit xcnt between interleaved
1025 // SMEM and VMEM operations. So there will never be
1026 // outstanding address translations for both SMEM and
1027 // VMEM at the same time.
1028 setScoreLB(T, getScoreUB(T) - 1);
1029 PendingEvents &= ~(1 << OtherEvent);
1030 }
1031 for (const MachineOperand &Op : Inst.all_uses())
1032 setScoreByOperand(Op, T, CurrScore);
1033 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1034 // Match the score to the destination registers.
1035 //
1036 // Check only explicit operands. Stores, especially spill stores, include
1037 // implicit uses and defs of their super registers which would create an
1038 // artificial dependency, while these are there only for register liveness
1039 // accounting purposes.
1040 //
1041 // Special cases where implicit register defs exist, such as M0 or VCC,
1042 // but none occur with memory instructions.
1043 for (const MachineOperand &Op : Inst.defs()) {
1044 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1045 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1046 continue;
1047 if (updateVMCntOnly(Inst)) {
1048 // updateVMCntOnly should only leave us with VGPRs
1049 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1050 // defs. That's required for a sane index into the `VMem` map below.
1051 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1052 VmemType V = getVmemType(Inst);
1053 unsigned char TypesMask = 1 << V;
1054 // If instruction can have Point Sample Accel applied, we have to flag
1055 // this with another potential dependency
1056 if (hasPointSampleAccel(Inst))
1057 TypesMask |= 1 << VMEM_NOSAMPLER;
1058 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1059 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1060 }
1061 }
1062 setScoreByOperand(Op, T, CurrScore);
1063 }
1064 if (Inst.mayStore() &&
1065 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1066 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1067 // written can be accessed. A load from LDS to VMEM does not need a wait.
1068 //
1069 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1070 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1071 // store. The "Slot" is the index into LDSDMAStores + 1.
1072 unsigned Slot = 0;
1073 for (const auto *MemOp : Inst.memoperands()) {
1074 if (!MemOp->isStore() ||
1075 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1076 continue;
1077 // Comparing just AA info does not guarantee memoperands are equal
1078 // in general, but this is so for LDS DMA in practice.
1079 auto AAI = MemOp->getAAInfo();
1080 // Alias scope information gives a way to definitely identify an
1081 // original memory object and practically produced in the module LDS
1082 // lowering pass. If there is no scope available we will not be able
1083 // to disambiguate LDS aliasing as after the module lowering all LDS
1084 // is squashed into a single big object.
1085 if (!AAI || !AAI.Scope)
1086 break;
1087 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1088 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1089 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1090 Slot = I + 1;
1091 break;
1092 }
1093 }
1094 }
1095 if (Slot)
1096 break;
1097 // The slot may not be valid because it can be >= NUM_LDSDMA which
1098 // means the scoreboard cannot track it. We still want to preserve the
1099 // MI in order to check alias information, though.
1100 LDSDMAStores.push_back(&Inst);
1101 Slot = LDSDMAStores.size();
1102 break;
1103 }
1104 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1105 if (Slot && Slot < NUM_LDSDMA)
1106 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1107 }
1108
1109 if (E == SCC_WRITE) {
1110 setRegScore(AMDGPU::SCC, T, CurrScore);
1111 PendingSCCWrite = &Inst;
1112 }
1113 }
1114}
1115
1116void WaitcntBrackets::print(raw_ostream &OS) const {
1117 const GCNSubtarget *ST = Context->ST;
1118
1119 OS << '\n';
1120 for (auto T : inst_counter_types(Context->MaxCounter)) {
1121 unsigned SR = getScoreRange(T);
1122
1123 switch (T) {
1124 case LOAD_CNT:
1125 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1126 << SR << "):";
1127 break;
1128 case DS_CNT:
1129 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1130 << SR << "):";
1131 break;
1132 case EXP_CNT:
1133 OS << " EXP_CNT(" << SR << "):";
1134 break;
1135 case STORE_CNT:
1136 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1137 << SR << "):";
1138 break;
1139 case SAMPLE_CNT:
1140 OS << " SAMPLE_CNT(" << SR << "):";
1141 break;
1142 case BVH_CNT:
1143 OS << " BVH_CNT(" << SR << "):";
1144 break;
1145 case KM_CNT:
1146 OS << " KM_CNT(" << SR << "):";
1147 break;
1148 case X_CNT:
1149 OS << " X_CNT(" << SR << "):";
1150 break;
1151 default:
1152 OS << " UNKNOWN(" << SR << "):";
1153 break;
1154 }
1155
1156 if (SR != 0) {
1157 // Print vgpr scores.
1158 unsigned LB = getScoreLB(T);
1159
1160 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1161 sort(SortedVMEMIDs);
1162
1163 for (auto ID : SortedVMEMIDs) {
1164 unsigned RegScore = VMem.at(ID).Scores[T];
1165 if (RegScore <= LB)
1166 continue;
1167 unsigned RelScore = RegScore - LB - 1;
1168 if (ID < REGUNITS_END) {
1169 OS << ' ' << RelScore << ":vRU" << ID;
1170 } else {
1171 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1172 "Unhandled/unexpected ID value!");
1173 OS << ' ' << RelScore << ":LDSDMA" << ID;
1174 }
1175 }
1176
1177 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1178 if (isSmemCounter(T)) {
1179 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1180 sort(SortedSMEMIDs);
1181 for (auto ID : SortedSMEMIDs) {
1182 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1183 if (RegScore <= LB)
1184 continue;
1185 unsigned RelScore = RegScore - LB - 1;
1186 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1187 }
1188 }
1189
1190 if (T == KM_CNT && SCCScore > 0)
1191 OS << ' ' << SCCScore << ":scc";
1192 }
1193 OS << '\n';
1194 }
1195
1196 OS << "Pending Events: ";
1197 if (hasPendingEvent()) {
1198 ListSeparator LS;
1199 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1200 if (hasPendingEvent((WaitEventType)I)) {
1201 OS << LS << WaitEventTypeName[I];
1202 }
1203 }
1204 } else {
1205 OS << "none";
1206 }
1207 OS << '\n';
1208
1209 OS << '\n';
1210}
1211
1212/// Simplify the waitcnt, in the sense of removing counts that are already
1213/// covered by the current state of the score brackets and are thus redundant.
1214void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
1215 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1216 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1217 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1218 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1219 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1220 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1221 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1222 simplifyXcnt(Wait, Wait);
1223}
1224
1225void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1226 unsigned &Count) const {
1227 // The number of outstanding events for this type, T, can be calculated
1228 // as (UB - LB). If the current Count is greater than or equal to the number
1229 // of outstanding events, then the wait for this counter is redundant.
1230 if (Count >= getScoreRange(T))
1231 Count = ~0u;
1232}
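
// Numeric illustration of the rule above (editorial note): if getScoreUB(T)
// is 10 and getScoreLB(T) is 7 there are 3 outstanding events, so a requested
// Count of 3 or more is already satisfied and becomes ~0u (no wait), while a
// requested Count of 2 must be kept.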
1233
1234void WaitcntBrackets::purgeEmptyTrackingData() {
1235 for (auto &[K, V] : make_early_inc_range(VMem)) {
1236 if (V.empty())
1237 VMem.erase(K);
1238 }
1239 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1240 if (V.empty())
1241 SGPRs.erase(K);
1242 }
1243}
1244
1245void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1246 unsigned ScoreToWait,
1247 AMDGPU::Waitcnt &Wait) const {
1248 const unsigned LB = getScoreLB(T);
1249 const unsigned UB = getScoreUB(T);
1250
1251 // If the score falls within the bracket, we need a waitcnt.
1252 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1253 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1254 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1255 // If there is a pending FLAT operation, and this is a VMem or LGKM
1256 // waitcnt and the target can report early completion, then we need
1257 // to force a waitcnt 0.
1258 addWait(Wait, T, 0);
1259 } else if (counterOutOfOrder(T)) {
1260 // The counter can get decremented out of order when there
1261 // are multiple event types in the bracket, so emit an s_wait
1262 // with a conservative value of 0 for the counter.
1263 addWait(Wait, T, 0);
1264 } else {
1265 // If a counter has been maxed out avoid overflow by waiting for
1266 // MAX(CounterType) - 1 instead.
1267 unsigned NeededWait =
1268 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1269 addWait(Wait, T, NeededWait);
1270 }
1271 }
1272}
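
// Worked example (editorial note): with LB == 2, UB == 10 and ScoreToWait == 7
// the score falls inside the bracket, so absent the FLAT and out-of-order
// special cases the emitted wait is UB - ScoreToWait == 3, clamped to
// getWaitCountMax(T) - 1 in case the counter has been maxed out.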
1273
1274void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1275 AMDGPU::Waitcnt &Wait) const {
1276 if (Reg == AMDGPU::SCC) {
1277 determineWaitForScore(T, SCCScore, Wait);
1278 } else {
1279 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1280 for (MCRegUnit RU : regunits(Reg))
1281 determineWaitForScore(
1282 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1283 Wait);
1284 }
1285}
1286
1287void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1288 AMDGPU::Waitcnt &Wait) const {
1289 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1290 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1291}
1292
1293void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1294 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1295 // SCC has landed
1296 if (PendingSCCWrite &&
1297 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1298 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1299 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1300 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1301 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1302 SCC_WRITE_PendingEvent) {
1303 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1304 }
1305
1306 PendingEvents &= ~SCC_WRITE_PendingEvent;
1307 PendingSCCWrite = nullptr;
1308 }
1309}
1310
1311void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1312 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1313 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1314 applyWaitcnt(DS_CNT, Wait.DsCnt);
1315 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1316 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1317 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1318 applyWaitcnt(KM_CNT, Wait.KmCnt);
1319 applyWaitcnt(X_CNT, Wait.XCnt);
1320}
1321
1322void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1323 const unsigned UB = getScoreUB(T);
1324 if (Count >= UB)
1325 return;
1326 if (Count != 0) {
1327 if (counterOutOfOrder(T))
1328 return;
1329 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1330 } else {
1331 setScoreLB(T, UB);
1332 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1333 }
1334}
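
// Illustration (editorial note): with LB == 2 and UB == 10, applyWaitcnt(T, 3)
// raises LB to 7 (at most 3 events still outstanding), while applyWaitcnt(T, 0)
// sets LB == UB and clears all pending events for T. Non-zero counts are
// ignored for out-of-order counters, which can only be resolved by waiting
// to zero.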
1335
1336bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
1337 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1338 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1339 // zero.
1340 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1341}
1342
1343bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
1344 // If we have a pending store we cannot optimize XCnt because we do not wait
1345 // for stores. VMEM loads return in order, so if we only have loads, XCnt is
1346 // decremented to the same number as LOADCnt.
1347 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1348 !hasPendingEvent(STORE_CNT);
1349}
1350
1351void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1352 AMDGPU::Waitcnt &UpdateWait) {
1353 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1354 // optimizations. On entry to a block with multiple predecessors, there may
1355 // be pending SMEM and VMEM events active at the same time.
1356 // In such cases, only clear one active event at a time.
1357 // TODO: Revisit xcnt optimizations for gfx1250.
1358 if (hasRedundantXCntWithKmCnt(CheckWait)) {
1359 if (!hasMixedPendingEvents(X_CNT)) {
1360 applyWaitcnt(X_CNT, 0);
1361 } else {
1362 PendingEvents &= ~(1 << SMEM_GROUP);
1363 }
1364 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1365 if (!hasMixedPendingEvents(X_CNT)) {
1366 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1367 } else if (CheckWait.LoadCnt == 0) {
1368 PendingEvents &= ~(1 << VMEM_GROUP);
1369 }
1370 }
1371 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1372}
1373
1374// Where there are multiple event types in the bracket of a counter,
1375// the decrements may go out of order.
1376bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1377 // Scalar memory reads can always complete out of order.
1378 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1379 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1380 return true;
1381 return hasMixedPendingEvents(T);
1382}
1383
1384INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1385 false, false)
1386INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1387INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1388INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1389 false, false)
1390
1391char SIInsertWaitcntsLegacy::ID = 0;
1392
1393char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1394
1395FunctionPass *llvm::createSIInsertWaitcntsPass() {
1396 return new SIInsertWaitcntsLegacy();
1397}
1398
1399static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1400 unsigned NewEnc) {
1401 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1402 assert(OpIdx >= 0);
1403
1404 MachineOperand &MO = MI.getOperand(OpIdx);
1405
1406 if (NewEnc == MO.getImm())
1407 return false;
1408
1409 MO.setImm(NewEnc);
1410 return true;
1411}
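
// Typical use of the helper above (editorial illustration), mirroring how
// applyPreexistingWaitcnt() below rewrites the immediate of an existing wait:
//
//   Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
//                                        AMDGPU::encodeWaitcnt(IV, Wait));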
1412
1413/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1414/// and if so, which counter it is waiting on.
1415static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1416 switch (Opcode) {
1417 case AMDGPU::S_WAIT_LOADCNT:
1418 return LOAD_CNT;
1419 case AMDGPU::S_WAIT_EXPCNT:
1420 return EXP_CNT;
1421 case AMDGPU::S_WAIT_STORECNT:
1422 return STORE_CNT;
1423 case AMDGPU::S_WAIT_SAMPLECNT:
1424 return SAMPLE_CNT;
1425 case AMDGPU::S_WAIT_BVHCNT:
1426 return BVH_CNT;
1427 case AMDGPU::S_WAIT_DSCNT:
1428 return DS_CNT;
1429 case AMDGPU::S_WAIT_KMCNT:
1430 return KM_CNT;
1431 case AMDGPU::S_WAIT_XCNT:
1432 return X_CNT;
1433 default:
1434 return {};
1435 }
1436}
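
// For example (editorial note), counterTypeForInstr(AMDGPU::S_WAIT_KMCNT)
// returns KM_CNT, while a combined wait such as S_WAIT_LOADCNT_DSCNT yields
// std::nullopt and is handled separately in applyPreexistingWaitcnt().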
1437
1438bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1439 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1440 if (Opcode == Waitcnt->getOpcode())
1441 return false;
1442
1443 Waitcnt->setDesc(TII->get(Opcode));
1444 return true;
1445}
1446
1447/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1448/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1449/// from \p Wait that were added by previous passes. Currently this pass
1450/// conservatively assumes that these preexisting waits are required for
1451/// correctness.
1452bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1453 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1454 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1455 assert(ST);
1456 assert(isNormalMode(MaxCounter));
1457
1458 bool Modified = false;
1459 MachineInstr *WaitcntInstr = nullptr;
1460 MachineInstr *WaitcntVsCntInstr = nullptr;
1461
1462 LLVM_DEBUG({
1463 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1464 if (It == OldWaitcntInstr.getParent()->instr_end())
1465 dbgs() << "end of block\n";
1466 else
1467 dbgs() << *It;
1468 });
1469
1470 for (auto &II :
1471 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1472 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1473 if (II.isMetaInstruction()) {
1474 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1475 continue;
1476 }
1477
1478 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1479 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1480
1481 // Update required wait count. If this is a soft waitcnt (= it was added
1482 // by an earlier pass), it may be entirely removed.
1483 if (Opcode == AMDGPU::S_WAITCNT) {
1484 unsigned IEnc = II.getOperand(0).getImm();
1485 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1486 if (TrySimplify)
1487 ScoreBrackets.simplifyWaitcnt(OldWait);
1488 Wait = Wait.combined(OldWait);
1489
1490 // Merge consecutive waitcnt of the same type by erasing multiples.
1491 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1492 II.eraseFromParent();
1493 Modified = true;
1494 } else
1495 WaitcntInstr = &II;
1496 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1497 assert(ST->hasVMemToLDSLoad());
1498 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1499 << "Before: " << Wait << '\n';);
1500 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1501 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1502
1503 // It is possible (but unlikely) that this is the only wait instruction,
1504 // in which case, we exit this loop without a WaitcntInstr to consume
1505 // `Wait`. But that works because `Wait` was passed in by reference, and
1506 // the caller eventually calls createNewWaitcnt on it. We test this
1507 // possibility in an artificial MIR test since such a situation cannot be
1508 // recreated by running the memory legalizer.
1509 II.eraseFromParent();
1510 } else {
1511 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1512 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1513
1514 unsigned OldVSCnt =
1515 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1516 if (TrySimplify)
1517 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1518 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1519
1520 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1521 II.eraseFromParent();
1522 Modified = true;
1523 } else
1524 WaitcntVsCntInstr = &II;
1525 }
1526 }
1527
1528 if (WaitcntInstr) {
1529 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1530 AMDGPU::encodeWaitcnt(IV, Wait));
1531 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1532
1533 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1534 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1535 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1536 Wait.LoadCnt = ~0u;
1537 Wait.ExpCnt = ~0u;
1538 Wait.DsCnt = ~0u;
1539
1540 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1541 ? dbgs()
1542 << "applied pre-existing waitcnt\n"
1543 << "New Instr at block end: " << *WaitcntInstr << '\n'
1544 : dbgs() << "applied pre-existing waitcnt\n"
1545 << "Old Instr: " << *It
1546 << "New Instr: " << *WaitcntInstr << '\n');
1547 }
1548
1549 if (WaitcntVsCntInstr) {
1550 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1551 AMDGPU::OpName::simm16, Wait.StoreCnt);
1552 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1553
1554 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1555 Wait.StoreCnt = ~0u;
1556
1557 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1558 ? dbgs() << "applied pre-existing waitcnt\n"
1559 << "New Instr at block end: " << *WaitcntVsCntInstr
1560 << '\n'
1561 : dbgs() << "applied pre-existing waitcnt\n"
1562 << "Old Instr: " << *It
1563 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1564 }
1565
1566 return Modified;
1567}
1568
1569/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1570/// required counters in \p Wait
1571bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1572 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1573 AMDGPU::Waitcnt Wait) {
1574 assert(ST);
1575 assert(isNormalMode(MaxCounter));
1576
1577 bool Modified = false;
1578 const DebugLoc &DL = Block.findDebugLoc(It);
1579
1580 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1581 // single instruction while VScnt has its own instruction.
1582 if (Wait.hasWaitExceptStoreCnt()) {
1583 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1584 [[maybe_unused]] auto SWaitInst =
1585 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1586 Modified = true;
1587
1588 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1589 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1590 dbgs() << "New Instr: " << *SWaitInst << '\n');
1591 }
1592
1593 if (Wait.hasWaitStoreCnt()) {
1594 assert(ST->hasVscnt());
1595
1596 [[maybe_unused]] auto SWaitInst =
1597 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1598 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1599 .addImm(Wait.StoreCnt);
1600 Modified = true;
1601
1602 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1603 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1604 dbgs() << "New Instr: " << *SWaitInst << '\n');
1605 }
1606
1607 return Modified;
1608}
1609
1610AMDGPU::Waitcnt
1611WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1612 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1613}
1614
1615AMDGPU::Waitcnt
1616WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1617 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1618 ~0u /* XCNT */);
1619}
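
// Note on the sentinel value used throughout this file: a count of ~0u means
// "no wait required" for that counter. getAllZeroWaitcnt() therefore returns
// 0 for every counter that must be fully drained and ~0u for counters that
// are deliberately left alone (e.g. VScnt when IncludeVSCnt is false, or
// XCNT in the GFX12+ variant above).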
1620
1621/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1622/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1623/// were added by previous passes. Currently this pass conservatively
1624/// assumes that these preexisting waits are required for correctness.
1625bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1626 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1627 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1628 assert(ST);
1629 assert(!isNormalMode(MaxCounter));
1630
1631 bool Modified = false;
1632 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1633 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1634 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1635
1636 LLVM_DEBUG({
1637 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1638 if (It == OldWaitcntInstr.getParent()->instr_end())
1639 dbgs() << "end of block\n";
1640 else
1641 dbgs() << *It;
1642 });
1643
1644 for (auto &II :
1645 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1646 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1647 if (II.isMetaInstruction()) {
1648 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1649 continue;
1650 }
1651
1652 MachineInstr **UpdatableInstr;
1653
1654 // Update required wait count. If this is a soft waitcnt (= it was added
1655 // by an earlier pass), it may be entirely removed.
1656
1657 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1658 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1659
1660 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1661 // attempt to do more than that either.
1662 if (Opcode == AMDGPU::S_WAITCNT)
1663 continue;
1664
1665 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1666 unsigned OldEnc =
1667 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1668 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1669 if (TrySimplify)
1670 ScoreBrackets.simplifyWaitcnt(OldWait);
1671 Wait = Wait.combined(OldWait);
1672 UpdatableInstr = &CombinedLoadDsCntInstr;
1673 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1674 unsigned OldEnc =
1675 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1676 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1677 if (TrySimplify)
1678 ScoreBrackets.simplifyWaitcnt(OldWait);
1679 Wait = Wait.combined(OldWait);
1680 UpdatableInstr = &CombinedStoreDsCntInstr;
1681 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1682 // Architectures higher than GFX10 do not have direct loads to
1683 // LDS, so no work required here yet.
1684 II.eraseFromParent();
1685 continue;
1686 } else {
1687 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1688 assert(CT.has_value());
1689 unsigned OldCnt =
1690 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1691 if (TrySimplify)
1692 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1693 addWait(Wait, CT.value(), OldCnt);
1694 UpdatableInstr = &WaitInstrs[CT.value()];
1695 }
1696
1697 // Merge consecutive waitcnt of the same type by erasing multiples.
1698 if (!*UpdatableInstr) {
1699 *UpdatableInstr = &II;
1700 } else {
1701 II.eraseFromParent();
1702 Modified = true;
1703 }
1704 }
1705
1706   // Save the pre-combine waitcnt in order to make the xcnt checks below.
1707 AMDGPU::Waitcnt PreCombine = Wait;
1708 if (CombinedLoadDsCntInstr) {
1709 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1710 // to be waited for. Otherwise, let the instruction be deleted so
1711 // the appropriate single counter wait instruction can be inserted
1712 // instead, when new S_WAIT_*CNT instructions are inserted by
1713 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1714 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1715 // the loop below that deals with single counter instructions.
1716 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1717 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1718 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1719 AMDGPU::OpName::simm16, NewEnc);
1720 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1721 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1722 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1723 Wait.LoadCnt = ~0u;
1724 Wait.DsCnt = ~0u;
1725
1726 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1727 ? dbgs() << "applied pre-existing waitcnt\n"
1728 << "New Instr at block end: "
1729 << *CombinedLoadDsCntInstr << '\n'
1730 : dbgs() << "applied pre-existing waitcnt\n"
1731 << "Old Instr: " << *It << "New Instr: "
1732 << *CombinedLoadDsCntInstr << '\n');
1733 } else {
1734 CombinedLoadDsCntInstr->eraseFromParent();
1735 Modified = true;
1736 }
1737 }
1738
1739 if (CombinedStoreDsCntInstr) {
1740 // Similarly for S_WAIT_STORECNT_DSCNT.
1741 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1742 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1743 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1744 AMDGPU::OpName::simm16, NewEnc);
1745 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1746 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1747 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1748 Wait.StoreCnt = ~0u;
1749 Wait.DsCnt = ~0u;
1750
1751 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1752 ? dbgs() << "applied pre-existing waitcnt\n"
1753 << "New Instr at block end: "
1754 << *CombinedStoreDsCntInstr << '\n'
1755 : dbgs() << "applied pre-existing waitcnt\n"
1756 << "Old Instr: " << *It << "New Instr: "
1757 << *CombinedStoreDsCntInstr << '\n');
1758 } else {
1759 CombinedStoreDsCntInstr->eraseFromParent();
1760 Modified = true;
1761 }
1762 }
1763
1764 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1765 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1766 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1767 // instructions so that createNewWaitcnt() will create new combined
1768 // instructions to replace them.
1769
1770 if (Wait.DsCnt != ~0u) {
1771 // This is a vector of addresses in WaitInstrs pointing to instructions
1772 // that should be removed if they are present.
1773     SmallVector<MachineInstr **, 2> WaitsToErase;
1774
1775 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1776 // both) need to be waited for, ensure that there are no existing
1777 // individual wait count instructions for these.
1778
1779 if (Wait.LoadCnt != ~0u) {
1780 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1781 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1782 } else if (Wait.StoreCnt != ~0u) {
1783 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1784 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1785 }
1786
1787 for (MachineInstr **WI : WaitsToErase) {
1788 if (!*WI)
1789 continue;
1790
1791 (*WI)->eraseFromParent();
1792 *WI = nullptr;
1793 Modified = true;
1794 }
1795 }
1796
1797 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1798 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1799 (CT == LOAD_CNT &&
1800 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1801 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1802 // due to taking the backedge of a block.
1803 ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1804 }
1805 if (!WaitInstrs[CT])
1806 continue;
1807
1808 unsigned NewCnt = getWait(Wait, CT);
1809 if (NewCnt != ~0u) {
1810 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1811 AMDGPU::OpName::simm16, NewCnt);
1812 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1813
1814 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1815 setNoWait(Wait, CT);
1816
1817 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1818 ? dbgs() << "applied pre-existing waitcnt\n"
1819 << "New Instr at block end: " << *WaitInstrs[CT]
1820 << '\n'
1821 : dbgs() << "applied pre-existing waitcnt\n"
1822 << "Old Instr: " << *It
1823 << "New Instr: " << *WaitInstrs[CT] << '\n');
1824 } else {
1825 WaitInstrs[CT]->eraseFromParent();
1826 Modified = true;
1827 }
1828 }
1829
1830 return Modified;
1831}
1832
1833/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1834bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1835 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1836 AMDGPU::Waitcnt Wait) {
1837 assert(ST);
1838 assert(!isNormalMode(MaxCounter));
1839
1840 bool Modified = false;
1841 const DebugLoc &DL = Block.findDebugLoc(It);
1842
1843 // Check for opportunities to use combined wait instructions.
1844 if (Wait.DsCnt != ~0u) {
1845 MachineInstr *SWaitInst = nullptr;
1846
1847 if (Wait.LoadCnt != ~0u) {
1848 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1849
1850 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1851 .addImm(Enc);
1852
1853 Wait.LoadCnt = ~0u;
1854 Wait.DsCnt = ~0u;
1855 } else if (Wait.StoreCnt != ~0u) {
1856 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1857
1858 SWaitInst =
1859 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1860 .addImm(Enc);
1861
1862 Wait.StoreCnt = ~0u;
1863 Wait.DsCnt = ~0u;
1864 }
1865
1866 if (SWaitInst) {
1867 Modified = true;
1868
1869 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1870 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1871 dbgs() << "New Instr: " << *SWaitInst << '\n');
1872 }
1873 }
1874
1875 // Generate an instruction for any remaining counter that needs
1876 // waiting for.
1877
1878 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1879 unsigned Count = getWait(Wait, CT);
1880 if (Count == ~0u)
1881 continue;
1882
1883 [[maybe_unused]] auto SWaitInst =
1884 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1885 .addImm(Count);
1886
1887 Modified = true;
1888
1889 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1890 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1891 dbgs() << "New Instr: " << *SWaitInst << '\n');
1892 }
1893
1894 return Modified;
1895}
1896
1897/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1898 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1899   // Currently all conventions wait, but this may not always be the case.
1900 //
1901 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1902 // sense to omit the wait and do it in the caller.
1903 return true;
1904}
1905
1906/// \returns true if the callee is expected to wait for any outstanding waits
1907/// before returning.
1908static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1909
1910/// Generate s_waitcnt instruction to be placed before cur_Inst.
1911/// Instructions of a given type are returned in order,
1912/// but instructions of different types can complete out of order.
1913/// We rely on this in-order completion
1914/// and simply assign a score to the memory access instructions.
1915/// We keep track of the active "score bracket" to determine
1916/// if an access of a memory read requires an s_waitcnt
1917/// and if so what the value of each counter is.
1918/// The "score bracket" is bound by the lower bound and upper bound
1919/// scores (*_score_LB and *_score_ub respectively).
1920/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1921/// flush the vmcnt counter here.
1922bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1923 WaitcntBrackets &ScoreBrackets,
1924 MachineInstr *OldWaitcntInstr,
1925 bool FlushVmCnt) {
1926 setForceEmitWaitcnt();
1927
1928 assert(!MI.isMetaInstruction());
1929
1930 AMDGPU::Waitcnt Wait;
1931 const unsigned Opc = MI.getOpcode();
1932
1933 // FIXME: This should have already been handled by the memory legalizer.
1934 // Removing this currently doesn't affect any lit tests, but we need to
1935 // verify that nothing was relying on this. The number of buffer invalidates
1936 // being handled here should not be expanded.
1937 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
1938 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
1939 Opc == AMDGPU::BUFFER_GL1_INV) {
1940 Wait.LoadCnt = 0;
1941 }
1942
1943 // All waits must be resolved at call return.
1944 // NOTE: this could be improved with knowledge of all call sites or
1945 // with knowledge of the called routines.
1946 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
1947 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1948 Opc == AMDGPU::S_SETPC_B64_return ||
1949 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1950 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1951 }
1952 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1953 // Technically the hardware will do this on its own if we don't, but that
1954 // might cost extra cycles compared to doing it explicitly.
1955 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1956 // have to wait for outstanding VMEM stores. In this case it can be useful to
1957 // send a message to explicitly release all VGPRs before the stores have
1958 // completed, but it is only safe to do this if there are no outstanding
1959 // scratch stores.
1960 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
1961 if (!WCG->isOptNone() &&
1962 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1963 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1964 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1965 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1966 ReleaseVGPRInsts.insert(&MI);
1967 }
1968 // Resolve vm waits before gs-done.
1969 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
1970 ST->hasLegacyGeometry() &&
1971 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1972             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1973     Wait.LoadCnt = 0;
1974 }
1975
1976 // Export & GDS instructions do not read the EXEC mask until after the export
1977 // is granted (which can occur well after the instruction is issued).
1978 // The shader program must flush all EXP operations on the export-count
1979 // before overwriting the EXEC mask.
1980 else {
1981 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1982 // Export and GDS are tracked individually, either may trigger a waitcnt
1983 // for EXEC.
1984 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1985 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1986 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1987 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1988 Wait.ExpCnt = 0;
1989 }
1990 }
1991
1992 // Wait for any pending GDS instruction to complete before any
1993 // "Always GDS" instruction.
1994 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
1995 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1996
1997 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1998 // The function is going to insert a wait on everything in its prolog.
1999 // This still needs to be careful if the call target is a load (e.g. a GOT
2000 // load). We also need to check WAW dependency with saved PC.
2001 Wait = AMDGPU::Waitcnt();
2002
2003 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2004 if (CallAddrOp.isReg()) {
2005 ScoreBrackets.determineWaitForPhysReg(
2006 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2007
2008 if (const auto *RtnAddrOp =
2009 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2010 ScoreBrackets.determineWaitForPhysReg(
2011 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2012 }
2013 }
2014 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2015 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2016 } else {
2017 // FIXME: Should not be relying on memoperands.
2018 // Look at the source operands of every instruction to see if
2019 // any of them results from a previous memory operation that affects
2020 // its current usage. If so, an s_waitcnt instruction needs to be
2021 // emitted.
2022 // If the source operand was defined by a load, add the s_waitcnt
2023 // instruction.
2024 //
2025 // Two cases are handled for destination operands:
2026 // 1) If the destination operand was defined by a load, add the s_waitcnt
2027 // instruction to guarantee the right WAW order.
2028 // 2) If a destination operand that was used by a recent export/store ins,
2029 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2030
2031 for (const MachineMemOperand *Memop : MI.memoperands()) {
2032 const Value *Ptr = Memop->getValue();
2033 if (Memop->isStore()) {
2034 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2035 addWait(Wait, SmemAccessCounter, 0);
2036 if (PDT->dominates(MI.getParent(), It->second))
2037 SLoadAddresses.erase(It);
2038 }
2039 }
2040 unsigned AS = Memop->getAddrSpace();
2041         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2042           continue;
2043 // No need to wait before load from VMEM to LDS.
2044 if (TII->mayWriteLDSThroughDMA(MI))
2045 continue;
2046
2047 // LOAD_CNT is only relevant to vgpr or LDS.
2048 unsigned TID = LDSDMA_BEGIN;
2049 if (Ptr && Memop->getAAInfo()) {
2050 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2051 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2052 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2053 if ((I + 1) >= NUM_LDSDMA) {
2054                 // We didn't have enough slots to track this LDS DMA store;
2055                 // it has been tracked using the common RegNo (FIRST_LDS_VGPR).
2056 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2057 break;
2058 }
2059
2060 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2061 }
2062 }
2063 } else {
2064 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2065 }
2066 if (Memop->isStore()) {
2067 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2068 }
2069 }
2070
2071 // Loop over use and def operands.
2072 for (const MachineOperand &Op : MI.operands()) {
2073 if (!Op.isReg())
2074 continue;
2075
2076 // If the instruction does not read tied source, skip the operand.
2077 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2078 continue;
2079
2080 MCPhysReg Reg = Op.getReg().asMCReg();
2081
2082 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2083 if (IsVGPR) {
2084 // Implicit VGPR defs and uses are never a part of the memory
2085             // instruction's description and are usually present to account for
2086 // super-register liveness.
2087 // TODO: Most of the other instructions also have implicit uses
2088 // for the liveness accounting only.
2089 if (Op.isImplicit() && MI.mayLoadOrStore())
2090 continue;
2091
2092 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2093 // previous write and this write are the same type of VMEM
2094 // instruction, in which case they are (in some architectures)
2095 // guaranteed to write their results in order anyway.
2096 // Additionally check instructions where Point Sample Acceleration
2097 // might be applied.
2098 if (Op.isUse() || !updateVMCntOnly(MI) ||
2099 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2100 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2101 !ST->hasVmemWriteVgprInOrder()) {
2102 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2103 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2104 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2105 ScoreBrackets.clearVgprVmemTypes(Reg);
2106 }
2107
2108 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2109 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2110 }
2111 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2112 } else if (Op.getReg() == AMDGPU::SCC) {
2113 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2114 } else {
2115 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2116 }
2117
2118 if (ST->hasWaitXCnt() && Op.isDef())
2119 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2120 }
2121 }
2122 }
2123
2124 // Ensure safety against exceptions from outstanding memory operations while
2125 // waiting for a barrier:
2126 //
2127 // * Some subtargets safely handle backing off the barrier in hardware
2128 // when an exception occurs.
2129 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2130 // there can be no outstanding memory operations during the wait.
2131 // * Subtargets with split barriers don't need to back off the barrier; it
2132 // is up to the trap handler to preserve the user barrier state correctly.
2133 //
2134 // In all other cases, ensure safety by ensuring that there are no outstanding
2135 // memory operations.
2136 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2137 !ST->supportsBackOffBarrier()) {
2138 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2139 }
2140
2141 // TODO: Remove this work-around, enable the assert for Bug 457939
2142 // after fixing the scheduler. Also, the Shader Compiler code is
2143 // independent of target.
2144 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2145 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2146 Wait.DsCnt = 0;
2147 }
2148
2149 // Verify that the wait is actually needed.
2150 ScoreBrackets.simplifyWaitcnt(Wait);
2151
2152   // Since the translation of VMEM addresses occurs in order, we can apply the
2153 // XCnt if the current instruction is of VMEM type and has a memory
2154 // dependency with another VMEM instruction in flight.
2155 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2156 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2157 Wait.XCnt = ~0u;
2158 }
2159
2160   // When forcing emit, skip terminators: emitting a waitcnt between the
2161   // terminators of the MBB would break the block's terminator sequence.
2162 if (ForceEmitZeroFlag && !MI.isTerminator())
2163 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2164
2165 if (ForceEmitWaitcnt[LOAD_CNT])
2166 Wait.LoadCnt = 0;
2167 if (ForceEmitWaitcnt[EXP_CNT])
2168 Wait.ExpCnt = 0;
2169 if (ForceEmitWaitcnt[DS_CNT])
2170 Wait.DsCnt = 0;
2171 if (ForceEmitWaitcnt[SAMPLE_CNT])
2172 Wait.SampleCnt = 0;
2173 if (ForceEmitWaitcnt[BVH_CNT])
2174 Wait.BvhCnt = 0;
2175 if (ForceEmitWaitcnt[KM_CNT])
2176 Wait.KmCnt = 0;
2177 if (ForceEmitWaitcnt[X_CNT])
2178 Wait.XCnt = 0;
2179
2180 if (FlushVmCnt) {
2181 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2182 Wait.LoadCnt = 0;
2183 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2184 Wait.SampleCnt = 0;
2185 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2186 Wait.BvhCnt = 0;
2187 }
2188
2189 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2190 Wait.LoadCnt = 0;
2191
2192 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2193 OldWaitcntInstr);
2194}
2195
2196bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2197                                        MachineBasicBlock::instr_iterator It,
2198                                        MachineBasicBlock &Block,
2199 WaitcntBrackets &ScoreBrackets,
2200 MachineInstr *OldWaitcntInstr) {
2201 bool Modified = false;
2202
2203 if (OldWaitcntInstr)
2204 // Try to merge the required wait with preexisting waitcnt instructions.
2205 // Also erase redundant waitcnt.
2206 Modified =
2207 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2208
2209 // Any counts that could have been applied to any existing waitcnt
2210 // instructions will have been done so, now deal with any remaining.
2211 ScoreBrackets.applyWaitcnt(Wait);
2212
2213 // ExpCnt can be merged into VINTERP.
2214 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2215       SIInstrInfo::isVINTERP(*It)) {
2216     MachineOperand *WaitExp =
2217 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2218 if (Wait.ExpCnt < WaitExp->getImm()) {
2219 WaitExp->setImm(Wait.ExpCnt);
2220 Modified = true;
2221 }
2222 Wait.ExpCnt = ~0u;
2223
2224 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2225 << "Update Instr: " << *It);
2226 }
2227
2228 if (WCG->createNewWaitcnt(Block, It, Wait))
2229 Modified = true;
2230
2231 return Modified;
2232}
2233
2234bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2235 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2236 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2237}
2238
2239// Return true if the next instruction is S_ENDPGM, following fallthrough
2240// blocks if necessary.
2241bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2242 MachineBasicBlock *Block) const {
2243 auto BlockEnd = Block->getParent()->end();
2244 auto BlockIter = Block->getIterator();
2245
2246 while (true) {
2247 if (It.isEnd()) {
2248 if (++BlockIter != BlockEnd) {
2249 It = BlockIter->instr_begin();
2250 continue;
2251 }
2252
2253 return false;
2254 }
2255
2256 if (!It->isMetaInstruction())
2257 break;
2258
2259 It++;
2260 }
2261
2262 assert(!It.isEnd());
2263
2264 return It->getOpcode() == AMDGPU::S_ENDPGM;
2265}
2266
2267// Add a wait after an instruction if architecture requirements mandate one.
2268bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2269 MachineBasicBlock &Block,
2270 WaitcntBrackets &ScoreBrackets) {
2271 AMDGPU::Waitcnt Wait;
2272 bool NeedsEndPGMCheck = false;
2273
2274 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2275 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2276                                   !SIInstrInfo::isAtomicRet(Inst));
2277
2278 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2279 Wait.DsCnt = 0;
2280 NeedsEndPGMCheck = true;
2281 }
2282
2283 ScoreBrackets.simplifyWaitcnt(Wait);
2284
2285 auto SuccessorIt = std::next(Inst.getIterator());
2286 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2287 /*OldWaitcntInstr=*/nullptr);
2288
2289 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2290 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2291 .addImm(0);
2292 }
2293
2294 return Result;
2295}
2296
2297void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2298 WaitcntBrackets *ScoreBrackets) {
2299 // Now look at the instruction opcode. If it is a memory access
2300 // instruction, update the upper-bound of the appropriate counter's
2301 // bracket and the destination operand scores.
2302 // For architectures with X_CNT, mark the source address operands
2303 // with the appropriate counter values.
2304 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2305
2306 bool IsVMEMAccess = false;
2307 bool IsSMEMAccess = false;
2308 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2309 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2310 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2311 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2312 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2313 ScoreBrackets->setPendingGDS();
2314 } else {
2315 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2316 }
2317 } else if (TII->isFLAT(Inst)) {
2318     if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
2319       ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2320 return;
2321 }
2322
2323 assert(Inst.mayLoadOrStore());
2324
2325 int FlatASCount = 0;
2326
2327 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2328 ++FlatASCount;
2329 IsVMEMAccess = true;
2330 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2331 }
2332
2333 if (TII->mayAccessLDSThroughFlat(Inst)) {
2334 ++FlatASCount;
2335 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2336 }
2337
2338 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2339   // pointers. They have one operand accessing global memory and another
2340   // accessing LDS, which makes them appear at this point to use a flat pointer. Filter
2341 // them out, and for the rest, generate a dependency on flat pointers so
2342 // that both VM and LGKM counters are flushed.
2343 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2344 ScoreBrackets->setPendingFlat();
2345 } else if (SIInstrInfo::isVMEM(Inst) &&
2346              !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2347     IsVMEMAccess = true;
2348 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2349
2350 if (ST->vmemWriteNeedsExpWaitcnt() &&
2351 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2352 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2353 }
2354 } else if (TII->isSMRD(Inst)) {
2355 IsSMEMAccess = true;
2356 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2357 } else if (Inst.isCall()) {
2358 if (callWaitsOnFunctionReturn(Inst)) {
2359 // Act as a wait on everything
2360 ScoreBrackets->applyWaitcnt(
2361 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2362 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2363 } else {
2364       // May need to wait for anything.
2365 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2366 }
2367 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2368 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2369 } else if (TII->isVINTERP(Inst)) {
2370 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2371 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2372 } else if (SIInstrInfo::isEXP(Inst)) {
2373 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2374     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2375       ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2376 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2377 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2378 else
2379 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2380 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2381 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2382 } else {
2383 switch (Inst.getOpcode()) {
2384 case AMDGPU::S_SENDMSG:
2385 case AMDGPU::S_SENDMSG_RTN_B32:
2386 case AMDGPU::S_SENDMSG_RTN_B64:
2387 case AMDGPU::S_SENDMSGHALT:
2388 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2389 break;
2390 case AMDGPU::S_MEMTIME:
2391 case AMDGPU::S_MEMREALTIME:
2392 case AMDGPU::S_GET_BARRIER_STATE_M0:
2393 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2394 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2395 break;
2396 }
2397 }
2398
2399 if (!ST->hasWaitXCnt())
2400 return;
2401
2402 if (IsVMEMAccess)
2403 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2404
2405 if (IsSMEMAccess)
2406 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2407}
2408
2409bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2410 unsigned OtherScore) {
2411 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2412 unsigned OtherShifted =
2413 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2414 Score = std::max(MyShifted, OtherShifted);
2415 return OtherShifted > MyShifted;
2416}
2417
2418/// Merge the pending events and associater score brackets of \p Other into
2419/// this brackets status.
2420///
2421/// Returns whether the merge resulted in a change that requires tighter waits
2422/// (i.e. the merged brackets strictly dominate the original brackets).
2423bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2424 bool StrictDom = false;
2425
2426 // Check if "other" has keys we don't have, and create default entries for
2427 // those. If they remain empty after merging, we will clean it up after.
2428 for (auto K : Other.VMem.keys())
2429 VMem.try_emplace(K);
2430 for (auto K : Other.SGPRs.keys())
2431 SGPRs.try_emplace(K);
2432
2433 for (auto T : inst_counter_types(Context->MaxCounter)) {
2434 // Merge event flags for this counter
2435 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2436 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2437 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2438 if (OtherEvents & ~OldEvents)
2439 StrictDom = true;
2440 PendingEvents |= OtherEvents;
2441
2442 // Merge scores for this counter
2443 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2444 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2445 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2446 if (NewUB < ScoreLBs[T])
2447 report_fatal_error("waitcnt score overflow");
2448
2449 MergeInfo M;
2450 M.OldLB = ScoreLBs[T];
2451 M.OtherLB = Other.ScoreLBs[T];
2452 M.MyShift = NewUB - ScoreUBs[T];
2453 M.OtherShift = NewUB - Other.ScoreUBs[T];
2454
2455 ScoreUBs[T] = NewUB;
2456
2457 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2458
2459 if (T == DS_CNT)
2460 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2461
2462 if (T == KM_CNT) {
2463 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2464 if (Other.hasPendingEvent(SCC_WRITE)) {
2465 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2466 if (!OldEventsHasSCCWrite) {
2467 PendingSCCWrite = Other.PendingSCCWrite;
2468 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2469 PendingSCCWrite = nullptr;
2470 }
2471 }
2472 }
2473
2474 for (auto &[RegID, Info] : VMem)
2475 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2476
2477 if (isSmemCounter(T)) {
2478 unsigned Idx = getSgprScoresIdx(T);
2479 for (auto &[RegID, Info] : SGPRs) {
2480 auto It = Other.SGPRs.find(RegID);
2481 unsigned OtherScore =
2482 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2483 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2484 }
2485 }
2486 }
2487
2488 for (auto &[TID, Info] : VMem) {
2489 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2490 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2491 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2492 Info.VMEMTypes = NewVmemTypes;
2493 }
2494 }
2495
2496 purgeEmptyTrackingData();
2497 return StrictDom;
2498}
2499
2500static bool isWaitInstr(MachineInstr &Inst) {
2501 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2502 return Opcode == AMDGPU::S_WAITCNT ||
2503 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2504 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2505 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2506 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2507 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2508 counterTypeForInstr(Opcode).has_value();
2509}
2510
2511// Generate s_waitcnt instructions where needed.
2512bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2513 MachineBasicBlock &Block,
2514 WaitcntBrackets &ScoreBrackets) {
2515 bool Modified = false;
2516
2517 LLVM_DEBUG({
2518 dbgs() << "*** Begin Block: ";
2519 Block.printName(dbgs());
2520 ScoreBrackets.dump();
2521 });
2522
2523 // Track the correctness of vccz through this basic block. There are two
2524 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2525 // ST->partialVCCWritesUpdateVCCZ().
2526 bool VCCZCorrect = true;
2527 if (ST->hasReadVCCZBug()) {
2528 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2529 // to vcc and then issued an smem load.
2530 VCCZCorrect = false;
2531 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2532 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2533 // to vcc_lo or vcc_hi.
2534 VCCZCorrect = false;
2535 }
2536
2537 // Walk over the instructions.
2538 MachineInstr *OldWaitcntInstr = nullptr;
2539
2540 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2541 E = Block.instr_end();
2542 Iter != E;) {
2543 MachineInstr &Inst = *Iter;
2544 if (Inst.isMetaInstruction()) {
2545 ++Iter;
2546 continue;
2547 }
2548
2549 // Track pre-existing waitcnts that were added in earlier iterations or by
2550 // the memory legalizer.
2551 if (isWaitInstr(Inst)) {
2552 if (!OldWaitcntInstr)
2553 OldWaitcntInstr = &Inst;
2554 ++Iter;
2555 continue;
2556 }
2557
2558 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2559 isPreheaderToFlush(Block, ScoreBrackets);
2560
2561 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2562 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2563 FlushVmCnt);
2564 OldWaitcntInstr = nullptr;
2565
2566 // Restore vccz if it's not known to be correct already.
2567 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2568
2569 // Don't examine operands unless we need to track vccz correctness.
2570 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2571 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2572 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2573 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2574 if (!ST->partialVCCWritesUpdateVCCZ())
2575 VCCZCorrect = false;
2576 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2577 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2578 // vccz bit, so when we detect that an instruction may read from a
2579 // corrupt vccz bit, we need to:
2580 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2581 // operations to complete.
2582 // 2. Restore the correct value of vccz by writing the current value
2583 // of vcc back to vcc.
2584 if (ST->hasReadVCCZBug() &&
2585 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2586 // Writes to vcc while there's an outstanding smem read may get
2587 // clobbered as soon as any read completes.
2588 VCCZCorrect = false;
2589 } else {
2590 // Writes to vcc will fix any incorrect value in vccz.
2591 VCCZCorrect = true;
2592 }
2593 }
2594 }
2595
2596 if (TII->isSMRD(Inst)) {
2597 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2598 // No need to handle invariant loads when avoiding WAR conflicts, as
2599 // there cannot be a vector store to the same memory location.
2600 if (!Memop->isInvariant()) {
2601 const Value *Ptr = Memop->getValue();
2602 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2603 }
2604 }
2605 if (ST->hasReadVCCZBug()) {
2606 // This smem read could complete and clobber vccz at any time.
2607 VCCZCorrect = false;
2608 }
2609 }
2610
2611 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2612
2613 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2614
2615 LLVM_DEBUG({
2616 Inst.print(dbgs());
2617 ScoreBrackets.dump();
2618 });
2619
2620 // TODO: Remove this work-around after fixing the scheduler and enable the
2621 // assert above.
2622 if (RestoreVCCZ) {
2623 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2624 // bit is updated, so we can restore the bit by reading the value of
2625 // vcc and then writing it back to the register.
2626 BuildMI(Block, Inst, Inst.getDebugLoc(),
2627 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2628 TRI->getVCC())
2629 .addReg(TRI->getVCC());
2630 VCCZCorrect = true;
2631 Modified = true;
2632 }
2633
2634 ++Iter;
2635 }
2636
2637 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2638 // needed.
2639 AMDGPU::Waitcnt Wait;
2640 if (Block.getFirstTerminator() == Block.end() &&
2641 isPreheaderToFlush(Block, ScoreBrackets)) {
2642 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2643 Wait.LoadCnt = 0;
2644 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2645 Wait.SampleCnt = 0;
2646 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2647 Wait.BvhCnt = 0;
2648 }
2649
2650 // Combine or remove any redundant waitcnts at the end of the block.
2651 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2652 OldWaitcntInstr);
2653
2654 LLVM_DEBUG({
2655 dbgs() << "*** End Block: ";
2656 Block.printName(dbgs());
2657 ScoreBrackets.dump();
2658 });
2659
2660 return Modified;
2661}
2662
2663// Return true if the given machine basic block is a preheader of a loop in
2664// which we want to flush the vmcnt counter, and false otherwise.
2665bool SIInsertWaitcnts::isPreheaderToFlush(
2666 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2667 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2668 if (!IsInserted)
2669 return Iterator->second;
2670
2671 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2672 if (!Succ)
2673 return false;
2674
2675 MachineLoop *Loop = MLI->getLoopFor(Succ);
2676 if (!Loop)
2677 return false;
2678
2679 if (Loop->getLoopPreheader() == &MBB &&
2680 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2681 Iterator->second = true;
2682 return true;
2683 }
2684
2685 return false;
2686}
2687
2688bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2689   if (SIInstrInfo::isFLAT(MI))
2690     return TII->mayAccessVMEMThroughFlat(MI);
2691 return SIInstrInfo::isVMEM(MI);
2692}
2693
2694// Return true if it is better to flush the vmcnt counter in the preheader of
2695// the given loop. We currently decide to flush in two situations:
2696// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2697// vgpr containing a value that is loaded outside of the loop. (Only on
2698// targets with no vscnt counter).
2699// 2. The loop contains vmem load(s), but the loaded values are not used in the
2700// loop, and at least one use of a vgpr containing a value that is loaded
2701// outside of the loop.
2702bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2703 const WaitcntBrackets &Brackets) {
2704 bool HasVMemLoad = false;
2705 bool HasVMemStore = false;
2706 bool UsesVgprLoadedOutside = false;
2707 DenseSet<MCRegUnit> VgprUse;
2708 DenseSet<MCRegUnit> VgprDef;
2709
2710 for (MachineBasicBlock *MBB : ML->blocks()) {
2711 for (MachineInstr &MI : *MBB) {
2712 if (isVMEMOrFlatVMEM(MI)) {
2713 HasVMemLoad |= MI.mayLoad();
2714 HasVMemStore |= MI.mayStore();
2715 }
2716
2717 for (const MachineOperand &Op : MI.all_uses()) {
2718 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2719 continue;
2720 // Vgpr use
2721 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2722 // If we find a register that is loaded inside the loop, 1. and 2.
2723 // are invalidated and we can exit.
2724 if (VgprDef.contains(RU))
2725 return false;
2726 VgprUse.insert(RU);
2727 // If at least one of Op's registers is in the score brackets, the
2728 // value is likely loaded outside of the loop.
2729 VMEMID ID = toVMEMID(RU);
2730 if (Brackets.getVMemScore(ID, LOAD_CNT) >
2731 Brackets.getScoreLB(LOAD_CNT) ||
2732 Brackets.getVMemScore(ID, SAMPLE_CNT) >
2733 Brackets.getScoreLB(SAMPLE_CNT) ||
2734 Brackets.getVMemScore(ID, BVH_CNT) >
2735 Brackets.getScoreLB(BVH_CNT)) {
2736 UsesVgprLoadedOutside = true;
2737 break;
2738 }
2739 }
2740 }
2741
2742 // VMem load vgpr def
2743 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2744 for (const MachineOperand &Op : MI.all_defs()) {
2745 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2746 // If we find a register that is loaded inside the loop, 1. and 2.
2747 // are invalidated and we can exit.
2748 if (VgprUse.contains(RU))
2749 return false;
2750 VgprDef.insert(RU);
2751 }
2752 }
2753 }
2754 }
2755 }
2756 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2757 return true;
2758 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2759}
2760
2761bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2762 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2763 auto *PDT =
2764 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2765 AliasAnalysis *AA = nullptr;
2766 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2767 AA = &AAR->getAAResults();
2768
2769 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2770}
2771
2772 PreservedAnalyses
2773 SIInsertWaitcntsPass::run(MachineFunction &MF,
2774                           MachineFunctionAnalysisManager &MFAM) {
2775   auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2776   auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2777   auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2778                  .getManager()
2779                  .getCachedResult<AAManager>(MF.getFunction());
2780
2781   if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2782     return PreservedAnalyses::all();
2783
2784   return getMachineFunctionPassPreservedAnalyses()
2785       .preserveSet<CFGAnalyses>()
2786       .preserve<AAManager>();
2787}
2788
2789bool SIInsertWaitcnts::run(MachineFunction &MF) {
2790 ST = &MF.getSubtarget<GCNSubtarget>();
2791 TII = ST->getInstrInfo();
2792 TRI = &TII->getRegisterInfo();
2793 MRI = &MF.getRegInfo();
2794   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2795
2796   IV = AMDGPU::getIsaVersion(ST->getCPU());
2797
2798 if (ST->hasExtendedWaitCounts()) {
2799 MaxCounter = NUM_EXTENDED_INST_CNTS;
2800 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2801 WCG = &WCGGFX12Plus;
2802 } else {
2803 MaxCounter = NUM_NORMAL_INST_CNTS;
2804 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
2805 WCG = &WCGPreGFX12;
2806 }
2807
2808 for (auto T : inst_counter_types())
2809 ForceEmitWaitcnt[T] = false;
2810
2811 WaitEventMaskForInst = WCG->getWaitEventMask();
2812
2813 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2814
2815 if (ST->hasExtendedWaitCounts()) {
2816 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2817 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2818 } else {
2819 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2820 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2821 }
2822 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2823 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2824 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2825 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2826 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2827 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2828
2829 BlockInfos.clear();
2830 bool Modified = false;
2831
2832 MachineBasicBlock &EntryBB = MF.front();
2833   MachineBasicBlock::iterator I = EntryBB.begin();
2834
2835 if (!MFI->isEntryFunction()) {
2836 // Wait for any outstanding memory operations that the input registers may
2837 // depend on. We can't track them and it's better to do the wait after the
2838 // costly call sequence.
2839
2840 // TODO: Could insert earlier and schedule more liberally with operations
2841 // that only use caller preserved registers.
2842 for (MachineBasicBlock::iterator E = EntryBB.end();
2843 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2844 ;
2845
2846 if (ST->hasExtendedWaitCounts()) {
2847 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2848 .addImm(0);
2849 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2850 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2851 continue;
2852
2853 if (!ST->hasImageInsts() &&
2854 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2855 continue;
2856
2857 BuildMI(EntryBB, I, DebugLoc(),
2858 TII->get(instrsForExtendedCounterTypes[CT]))
2859 .addImm(0);
2860 }
2861 } else {
2862 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2863 }
2864
2865 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2866 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2867 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2868
2869 Modified = true;
2870 }
2871
2872 // Keep iterating over the blocks in reverse post order, inserting and
2873 // updating s_waitcnt where needed, until a fix point is reached.
2874 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2875 BlockInfos.try_emplace(MBB);
2876
2877 std::unique_ptr<WaitcntBrackets> Brackets;
2878 bool Repeat;
2879 do {
2880 Repeat = false;
2881
2882 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2883 ++BII) {
2884 MachineBasicBlock *MBB = BII->first;
2885 BlockInfo &BI = BII->second;
2886 if (!BI.Dirty)
2887 continue;
2888
2889 if (BI.Incoming) {
2890 if (!Brackets)
2891 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2892 else
2893 *Brackets = *BI.Incoming;
2894 } else {
2895 if (!Brackets) {
2896 Brackets = std::make_unique<WaitcntBrackets>(this);
2897 } else {
2898 // Reinitialize in-place. N.B. do not do this by assigning from a
2899 // temporary because the WaitcntBrackets class is large and it could
2900 // cause this function to use an unreasonable amount of stack space.
2901 Brackets->~WaitcntBrackets();
2902 new (Brackets.get()) WaitcntBrackets(this);
2903 }
2904 }
2905
2906 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2907 BI.Dirty = false;
2908
2909 if (Brackets->hasPendingEvent()) {
2910 BlockInfo *MoveBracketsToSucc = nullptr;
2911 for (MachineBasicBlock *Succ : MBB->successors()) {
2912 auto *SuccBII = BlockInfos.find(Succ);
2913 BlockInfo &SuccBI = SuccBII->second;
2914 if (!SuccBI.Incoming) {
2915 SuccBI.Dirty = true;
2916 if (SuccBII <= BII) {
2917 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2918 Repeat = true;
2919 }
2920 if (!MoveBracketsToSucc) {
2921 MoveBracketsToSucc = &SuccBI;
2922 } else {
2923 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2924 }
2925 } else if (SuccBI.Incoming->merge(*Brackets)) {
2926 SuccBI.Dirty = true;
2927 if (SuccBII <= BII) {
2928 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2929 Repeat = true;
2930 }
2931 }
2932 }
2933 if (MoveBracketsToSucc)
2934 MoveBracketsToSucc->Incoming = std::move(Brackets);
2935 }
2936 }
2937 } while (Repeat);
2938
2939 if (ST->hasScalarStores()) {
2940 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2941 bool HaveScalarStores = false;
2942
2943 for (MachineBasicBlock &MBB : MF) {
2944 for (MachineInstr &MI : MBB) {
2945 if (!HaveScalarStores && TII->isScalarStore(MI))
2946 HaveScalarStores = true;
2947
2948 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2949 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2950 EndPgmBlocks.push_back(&MBB);
2951 }
2952 }
2953
2954 if (HaveScalarStores) {
2955 // If scalar writes are used, the cache must be flushed or else the next
2956 // wave to reuse the same scratch memory can be clobbered.
2957 //
2958 // Insert s_dcache_wb at wave termination points if there were any scalar
2959 // stores, and only if the cache hasn't already been flushed. This could
2960 // be improved by looking across blocks for flushes in postdominating
2961 // blocks from the stores but an explicitly requested flush is probably
2962 // very rare.
2963 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2964 bool SeenDCacheWB = false;
2965
2966 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2967 I != E; ++I) {
2968 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2969 SeenDCacheWB = true;
2970 else if (TII->isScalarStore(*I))
2971 SeenDCacheWB = false;
2972
2973 // FIXME: It would be better to insert this before a waitcnt if any.
2974 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2975 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2976 !SeenDCacheWB) {
2977 Modified = true;
2978 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2979 }
2980 }
2981 }
2982 }
2983 }
2984
2985 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2986 // This is done in different ways depending on how the VGPRs were allocated
2987 // (i.e. whether we're in dynamic VGPR mode or not).
2988   // Skip the deallocation if the kernel is waveslot limited rather than VGPR
2989   // limited: a short waveslot-limited kernel runs slower with the deallocation.
2990 if (MFI->isDynamicVGPREnabled()) {
2991 for (MachineInstr *MI : ReleaseVGPRInsts) {
2992 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2993 TII->get(AMDGPU::S_ALLOC_VGPR))
2994 .addImm(0);
2995 Modified = true;
2996 }
2997 } else {
2998 if (!ReleaseVGPRInsts.empty() &&
2999 (MF.getFrameInfo().hasCalls() ||
3000 ST->getOccupancyWithNumVGPRs(
3001 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3002 /*IsDynamicVGPR=*/false) <
3003              AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
3004       for (MachineInstr *MI : ReleaseVGPRInsts) {
3005 if (ST->requiresNopBeforeDeallocVGPRs()) {
3006 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3007 TII->get(AMDGPU::S_NOP))
3008 .addImm(0);
3009 }
3010 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3011 TII->get(AMDGPU::S_SENDMSG))
3012             .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3013         Modified = true;
3014 }
3015 }
3016 }
3017 ReleaseVGPRInsts.clear();
3018 PreheadersToFlush.clear();
3019 SLoadAddresses.clear();
3020
3021 return Modified;
3022}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
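
A minimal sketch of how a new-pass-manager run() method like the one above usually reports its preserved analyses, using getMachineFunctionPassPreservedAnalyses and CFGAnalyses (both also listed on this page); MySketchPass is a hypothetical name, not this pass:

#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

class MySketchPass : public PassInfoMixin<MySketchPass> {
public:
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    bool Changed = false;
    // ... transform MF, setting Changed as appropriate; analysis results
    // would be obtained with MFAM.getResult<...>(MF) ...
    if (!Changed)
      return PreservedAnalyses::all();
    auto PA = getMachineFunctionPassPreservedAnalyses();
    PA.preserveSet<CFGAnalyses>(); // the block structure was not touched
    return PA;
  }
};
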
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition StringRef.h:854
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
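
A small sketch of the idiom: llvm_unreachable documents (and, in asserting builds, traps on) switch cases that are supposed to be impossible. The function and its cases are made up for illustration:

#include "llvm/Support/ErrorHandling.h"

static const char *counterName(int Kind) {
  switch (Kind) {
  case 0:
    return "loadcnt";
  case 1:
    return "dscnt";
  default:
    llvm_unreachable("unexpected counter kind");
  }
}
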
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
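
A hedged sketch of how the pre-gfx12 combined waitcnt helpers above fit together: decode an encoded immediate, tighten one field, and re-encode it. The include path is the one used inside the AMDGPU backend (assumed context); the function itself is illustrative, not part of this pass:

#include "Utils/AMDGPUBaseInfo.h" // backend-internal header (assumed context)
#include <cassert>

using namespace llvm;

static unsigned forceZeroVmcnt(const AMDGPU::IsaVersion &IV, unsigned Encoded) {
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  AMDGPU::decodeWaitcnt(IV, Encoded, Vmcnt, Expcnt, Lgkmcnt);

  // Wait for all outstanding vector-memory operations; keep the other fields.
  Vmcnt = 0;

  // The per-field bit masks bound what each counter can hold on this ISA.
  assert(Expcnt <= AMDGPU::getExpcntBitMask(IV) &&
         Lgkmcnt <= AMDGPU::getLgkmcntBitMask(IV));
  return AMDGPU::encodeWaitcnt(IV, Vmcnt, Expcnt, Lgkmcnt);
}
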
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
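
For example, the range form of all_of reads as a single predicate over a container (the container and predicate here are arbitrary):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static bool allEven(const SmallVector<int, 4> &Vals) {
  // Equivalent to std::all_of(Vals.begin(), Vals.end(), ...).
  return all_of(Vals, [](int V) { return V % 2 == 0; });
}
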
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
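
A hedged sketch of the builder interface together with addImm: insert a new S_WAITCNT carrying a pre-encoded immediate right before an existing instruction. TII and Enc are assumed to be supplied by the caller, and the includes reflect the backend-internal context; this is not a verbatim excerpt from the pass:

#include "MCTargetDesc/AMDGPUMCTargetDesc.h" // opcode enum (assumed context)
#include "SIInstrInfo.h"                     // backend-internal (assumed context)
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

static void emitWaitBefore(MachineInstr &MI, const SIInstrInfo *TII,
                           unsigned Enc) {
  MachineBasicBlock &MBB = *MI.getParent();
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(Enc); // Enc: a value produced by AMDGPU::encodeWaitcnt(...)
}
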
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
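
A minimal sketch of enum_seq with a made-up enum (not one of this pass's counters): the range is half-open, so the sentinel value is never visited. The enum_iteration_traits specialization opts the enum into iteration:

#include "llvm/ADT/Sequence.h"
#include "llvm/Support/raw_ostream.h"

enum Stage { Fetch = 0, Decode, Execute, STAGE_COUNT };

namespace llvm {
template <> struct enum_iteration_traits<Stage> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm

static void listStages() {
  for (Stage S : llvm::enum_seq(Fetch, STAGE_COUNT))
    llvm::errs() << static_cast<int>(S) << "\n"; // prints 0, 1, 2
}
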
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
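
A short sketch of make_early_inc_range over a machine basic block: the iterator is advanced before the loop body runs, so erasing the current instruction is safe. The filtering condition is arbitrary and only for illustration:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

using namespace llvm;

static void eraseMetaInstrs(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : make_early_inc_range(MBB))
    if (MI.isMetaInstruction())
      MI.eraseFromParent();
}
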
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily all virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.