docs/doxygen/SIInsertWaitcnts_8cpp_source.html

//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// Insert wait instructions for memory reads and writes.

///

/// Memory reads and writes are issued asynchronously, so we need to insert

/// S_WAITCNT instructions when we want to access any of their results or

/// overwrite any register that's used asynchronously.

///

/// TODO: This pass currently keeps one timeline per hardware counter. A more

/// finely-grained approach that keeps one timeline per event type could

/// sometimes get away with generating weaker s_waitcnt instructions. For

/// example, when both SMEM and LDS are in flight and we need to wait for

/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,

/// but the pass will currently generate a conservative lgkmcnt(0) because

/// multiple event types are in flight.

//

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "GCNSubtarget.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIMachineFunctionInfo.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/ADT/MapVector.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/Sequence.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/CodeGen/MachineFrameInfo.h"

#include "llvm/CodeGen/MachineLoopInfo.h"

#include "llvm/CodeGen/MachinePassManager.h"

#include "llvm/CodeGen/MachinePostDominators.h"

#include "llvm/IR/Dominators.h"

#include "llvm/InitializePasses.h"

#include "llvm/Support/DebugCounter.h"

#include "llvm/TargetParser/TargetParser.h"


using namespace llvm;

using namespace llvm::AMDGPU;


#define DEBUG_TYPE "si-insert-waitcnts"


// Debug counters that request forced zero-count waits for one counter group
// each (expcnt, lgkmcnt, vmcnt). setForceEmitWaitcnt() reads them only when
// NDEBUG is not defined.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");


// Hidden option, off by default: emit every wait as a wait for zero on all of
// vmcnt, expcnt and lgkmcnt.
static cl::opt<bool>
    ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
                      cl::desc("Force all waitcnt instrs to be emitted as "
                               "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                      cl::init(false), cl::Hidden);

// Hidden option, off by default: force waits on load counters down to zero.
static cl::opt<bool> ForceEmitZeroLoadFlag(
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    cl::init(false), cl::Hidden);

// Hidden option, off by default: request expert scheduling mode 2 for every
// function. Only meaningful on GFX12+ according to its description.
static cl::opt<bool> ExpertSchedulingModeFlag(
    "amdgpu-expert-scheduling-mode",
    cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
    cl::init(false), cl::Hidden);


namespace {

/// Look up the maximum value counter \p T may take according to \p Limits.
///
/// \returns the matching *Max field of \p Limits, or 0 for any counter type
/// that has no field listed here.
static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
                                InstCounterType T) {
  unsigned Max = 0;
  switch (T) {
  case LOAD_CNT:
    Max = Limits.LoadcntMax;
    break;
  case DS_CNT:
    Max = Limits.DscntMax;
    break;
  case EXP_CNT:
    Max = Limits.ExpcntMax;
    break;
  case STORE_CNT:
    Max = Limits.StorecntMax;
    break;
  case SAMPLE_CNT:
    Max = Limits.SamplecntMax;
    break;
  case BVH_CNT:
    Max = Limits.BvhcntMax;
    break;
  case KM_CNT:
    Max = Limits.KmcntMax;
    break;
  case X_CNT:
    Max = Limits.XcntMax;
    break;
  case VA_VDST:
    Max = Limits.VaVdstMax;
    break;
  case VM_VSRC:
    Max = Limits.VmVsrcMax;
    break;
  default:
    // Unknown counter: report no capacity.
    break;
  }
  return Max;
}


/// Integer IDs used to track vector memory locations we may have to wait on.

/// Encoded as u16 chunks:

///

///   [0,               REGUNITS_END ): MCRegUnit

///   [LDSDMA_BEGIN,    LDSDMA_END  ) : LDS DMA IDs

///

/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.

/// It gives (1 << 16) entries per category which is more than enough

/// for all register units. MCPhysReg is u16 so we don't even support >u16

/// physical register numbers at this time, let alone >u16 register units.

/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END

/// is enough for all register units.

using VMEMID = uint32_t;

// Layout of the VMEMID space: two consecutive half-open ranges of
// TRACKINGID_RANGE_LEN IDs each, register units first, then LDS DMA.
enum : VMEMID {
  // Number of IDs reserved for each tracking category.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};


/// Map register unit \p RU to its tracking ID. Register units occupy the
/// range starting at REGUNITS_BEGIN (0), so the ID is the unit's own number.
static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  const unsigned UnitNumber = static_cast<unsigned>(RU);
  return UnitNumber;
}


// Master list of wait events. Every entry is handed to DECL, so the same list
// is expanded once into the WaitEventType enumerators and once into their
// printable names, which keeps the two in the same order.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */    \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */                \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(VMEM_GROUP)               /* vmem group */                              \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SCC_WRITE)                /* write to SCC from barrier */               \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */       \
  DECL(VGPR_CSMACC_WRITE)        /* write VGPR dest in Core/Side-MACC VALU */  \
  DECL(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */          \
  DECL(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */           \
  DECL(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */             \
  DECL(VGPR_LDS_READ)            /* read VGPR source in LDS */                 \
  DECL(VGPR_FLAT_READ)           /* read VGPR source in FLAT */                \
  DECL(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */


// clang-format off

// Expand every entry of AMDGPU_DECLARE_WAIT_EVENTS into an enumerator.
// NUM_WAIT_EVENTS follows the last event and therefore equals the number of
// events declared.
#define AMDGPU_EVENT_ENUM(Name) Name,
enum WaitEventType {
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
  NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM

} // namespace


namespace llvm {

// Mark WaitEventType as iterable. wait_events() calls enum_seq() on this
// enum, which presumably requires this opt-in (confirm against
// llvm/ADT/Sequence.h).
template <> struct enum_iteration_traits<WaitEventType> {
  static constexpr bool is_iterable = true;
};

} // namespace llvm


namespace {


/// Return an iterable sequence of wait events that starts at VMEM_ACCESS (the
/// first event) and stops before \c MaxEvent. With the default argument the
/// sequence covers every declared event.
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  constexpr WaitEventType FirstEvent = VMEM_ACCESS;
  return enum_seq(FirstEvent, MaxEvent);
}


// Stringify every entry of AMDGPU_DECLARE_WAIT_EVENTS. Because the enum is
// generated from the same list, entry i holds the spelling of the
// WaitEventType enumerator whose value is i.
#define AMDGPU_EVENT_NAME(Name) #Name,
static constexpr StringLiteral WaitEventTypeName[] = {
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
};
#undef AMDGPU_EVENT_NAME

/// \returns the spelling of \p Event as written in
/// AMDGPU_DECLARE_WAIT_EVENTS.
///
/// \p Event must name a real event. NUM_WAIT_EVENTS (or anything larger) has
/// no entry in the name table, so it is rejected by the assertion instead of
/// reading past the end of the table.
static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
  assert(Event < NUM_WAIT_EVENTS && "no name for an out-of-range wait event");
  return WaitEventTypeName[Event];
}

// clang-format on


// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
//
// NUM_VMEM_TYPES is the number of types and is used to size per-type tables.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};


// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
// The table has NUM_EXTENDED_INST_CNTS slots; the eight opcodes are listed in
// the order load, ds, exp, store, sample, bvh, km, x.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};


/// \returns true if \p Inst is a VMEM instruction that is not FLAT, or is a
/// FLAT global or FLAT scratch instruction.
static bool updateVMCntOnly(const MachineInstr &Inst) {
  // Non-FLAT VMEM qualifies on its own.
  if (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst))
    return true;

  // Otherwise only the global and scratch flavours of FLAT qualify.
  return SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
}


#ifndef NDEBUG
/// \returns true if \p MaxCounter equals NUM_NORMAL_INST_CNTS. Only compiled
/// when NDEBUG is not defined.
static bool isNormalMode(InstCounterType MaxCounter) {
  const bool UsesNormalCounters = (NUM_NORMAL_INST_CNTS == MaxCounter);
  return UsesNormalCounters;
}
#endif // NDEBUG


/// Classify \p Inst, which must satisfy updateVMCntOnly(), into a VmemType.
///
/// Non-image instructions are VMEM_NOSAMPLER. Image instructions are
/// classified from their MIMG base opcode info: BVH first, then sampler-like,
/// and VMEM_NOSAMPLER otherwise.
VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));

  const bool IsImage = SIInstrInfo::isImage(Inst);
  if (!IsImage)
    return VMEM_NOSAMPLER;

  const AMDGPU::MIMGInfo *ImageInfo = AMDGPU::getMIMGInfo(Inst.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *Base =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageInfo->BaseOpcode);

  if (Base->BVH)
    return VMEM_BVH;

  // The isVSAMPLE check is needed in addition to the base opcode flags: some
  // instructions have no sampler but still count as sampler instructions for
  // the purposes of e.g. waitcnt.
  const bool CountsAsSampler =
      Base->Sampler || Base->MSAA || SIInstrInfo::isVSAMPLE(Inst);
  return CountsAsSampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}


/// Tighten the wait on counter \p T in \p Wait to at most \p Count. A value
/// already stored that is smaller than \p Count is left as it is.
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  const unsigned Current = Wait.get(T);
  const unsigned Tightest = Count < Current ? Count : Current;
  Wait.set(T, Tightest);
}


/// Set counter \p T in \p Wait to ~0u, the value used for "no wait".
void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  constexpr unsigned NoWaitValue = ~0u;
  Wait.set(T, NoWaitValue);
}


/// A small set of events.

class WaitEventSet {

  unsigned Mask = 0;


public:

  WaitEventSet() = default;

  explicit constexpr WaitEventSet(WaitEventType Event) {

    static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,

                  "Not enough bits in Mask for all the events");

    Mask |= 1 << Event;

  }

  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {

    for (auto &E : Events) {

      Mask |= 1 << E;

    }

  }

  void insert(const WaitEventType &Event) { Mask |= 1 << Event; }

  void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }

  void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }

  bool contains(const WaitEventType &Event) const {

    return Mask & (1 << Event);

  }

  /// \Returns true if this set contains all elements of \p Other.

  bool contains(const WaitEventSet &Other) const {

    return (~Mask & Other.Mask) == 0;

  }

  /// \Returns the intersection of this and \p Other.

  WaitEventSet operator&(const WaitEventSet &Other) const {

    auto Copy = *this;

    Copy.Mask &= Other.Mask;

    return Copy;

  }

  /// \Returns the union of this and \p Other.

  WaitEventSet operator|(const WaitEventSet &Other) const {

    auto Copy = *this;

    Copy.Mask |= Other.Mask;

    return Copy;

  }

  /// This set becomes the union of this and \p Other.

  WaitEventSet &operator|=(const WaitEventSet &Other) {

    Mask |= Other.Mask;

    return *this;

  }

  /// This set becomes the intersection of this and \p Other.

  WaitEventSet &operator&=(const WaitEventSet &Other) {

    Mask &= Other.Mask;

    return *this;

  }

  bool operator==(const WaitEventSet &Other) const {

    return Mask == Other.Mask;

  }

  bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }

  bool empty() const { return Mask == 0; }

  /// \Returns true if the set contains more than one element.

  bool twoOrMore() const { return Mask & (Mask - 1); }

  operator bool() const { return !empty(); }

  void print(raw_ostream &OS) const {

    ListSeparator LS(", ");

    for (WaitEventType Event : wait_events()) {

      if (contains(Event))

        OS << LS << getWaitEventTypeName(Event);

    }

  }

  LLVM_DUMP_METHOD void dump() const;

};


void WaitEventSet::dump() const {

  print(dbgs());

  dbgs() << "\n";

}


class WaitcntBrackets;


// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
//
// Concrete generators implement the pure virtual functions below; this base
// class only stores per-function state and provides getCounterFromEvent().
class WaitcntGenerator {
protected:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  // True if the function has the optnone attribute or the target is compiled
  // at CodeGenOptLevel::None.
  bool OptNone;
  // True if the function carries the "amdgpu-expand-waitcnt-profiling"
  // attribute.
  bool ExpandWaitcntProfiling = false;
  // Held by reference: the referenced limits must outlive this generator.
  const AMDGPU::HardwareLimits &Limits;

public:
  WaitcntGenerator() = delete;
  WaitcntGenerator(const WaitcntGenerator &) = delete;
  // Capture the subtarget, instruction info, ISA version and option state of
  // \p MF. Note that TII is initialized from ST, so ST must be declared first.
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
                   const AMDGPU::HardwareLimits &Limits)
      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
        ExpandWaitcntProfiling(
            MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
        Limits(Limits) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  // Return the hardware limits this generator was constructed with.
  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
  // were made.
  //
  // This editing will usually be merely updated operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  // ScoreBrackets is used for profiling expansion.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait,
                                const WaitcntBrackets &ScoreBrackets) = 0;

  // Returns the WaitEventSet that corresponds to counter \p T.
  virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;

  /// \returns the counter that corresponds to event \p E.
  ///
  /// Counters are scanned in inst_counter_types() order and the first one
  /// whose event set contains \p E wins. Reaching the end without a match is
  /// treated as unreachable.
  InstCounterType getCounterFromEvent(WaitEventType E) const {
    for (auto T : inst_counter_types()) {
      if (getWaitEvents(T).contains(E))
        return T;
    }
    llvm_unreachable("event type has no associated counter");
  }

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;
};


// WaitcntGenerator implementation for targets before GFX12 (going by the
// class name). Its event table leaves the last six counters without events.
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
  // Events tracked by each counter; getWaitEvents() indexes this table with
  // an InstCounterType. Rows are presumably in InstCounterType declaration
  // order (the enum is declared outside this file section -- confirm).
  static constexpr const WaitEventSet
      WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
          WaitEventSet(
              {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet()};

public:
  // Reuse the base class constructors unchanged.
  using WaitcntGenerator::WaitcntGenerator;
  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  // Return the table row for counter \p T. No bounds check is performed.
  const WaitEventSet &getWaitEvents(InstCounterType T) const override {
    return WaitEventMaskForInstPreGFX12[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};


// WaitcntGenerator implementation for GFX12 and later (going by the class
// name). Every counter row of its event table is populated.
class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
  // Expert-mode flag passed in at construction.
  bool IsExpertMode;
  // Events tracked by each counter; getWaitEvents() indexes this table with
  // an InstCounterType. Rows are presumably in InstCounterType declaration
  // order (the enum is declared outside this file section -- confirm).
  static constexpr const WaitEventSet
      WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
          WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
          WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
          WaitEventSet({VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
          WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
          WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
                        VGPR_XDL_WRITE}),
          WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};

public:
  WaitcntGeneratorGFX12Plus() = delete;
  // Forward \p MF, \p MaxCounter and \p Limits to the base class and record
  // \p IsExpertMode.
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter,
                            const AMDGPU::HardwareLimits &Limits,
                            bool IsExpertMode)
      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  // Return the table row for counter \p T. No bounds check is performed.
  const WaitEventSet &getWaitEvents(InstCounterType T) const override {
    return WaitEventMaskForInstGFX12Plus[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};


// Flags indicating which counters should be flushed in a loop preheader.
// Both flags default to false, i.e. nothing is flushed.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false;
  bool FlushDsCnt = false;
};


/// State and helpers of the wait-count insertion pass for one
/// MachineFunction: analysis handles, per-block bracket state, the
/// target-specific wait generator and instruction classification helpers.
class SIInsertWaitcnts {
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
  MachineLoopInfo &MLI;
  MachinePostDominatorTree &PDT;
  AliasAnalysis *AA = nullptr;
  MachineFunction &MF;

  /// Per-block record: the incoming bracket state and a dirty flag that
  /// starts out set.
  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  /// One flag per counter, all false initially; see setForceEmitWaitcnt().
  bool ForceEmitWaitcnt[NUM_INST_CNTS] = {};

  std::unique_ptr<WaitcntGenerator> WCG;

  // Call and return instructions seen in the function.
  DenseSet<MachineInstr *> CallInsts;
  DenseSet<MachineInstr *> ReturnInsts;

  // All S_ENDPGM instructions. The mapped flag is true if there might be
  // outstanding stores but definitely no outstanding scratch stores; it helps
  // with insertion of DEALLOC_VGPRS messages.
  DenseMap<MachineInstr *, bool> EndPgmInsts;

  AMDGPU::HardwareLimits Limits;

public:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  const MachineRegisterInfo &MRI;
  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  bool IsExpertMode = false;

  /// Bind the analyses and cache subtarget, instruction, register and
  /// machine-register info for \p MF.
  SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
                   AliasAnalysis *AA, MachineFunction &MF)
      : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
        TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
        MRI(MF.getRegInfo()) {
    // The debug counters are referenced here as discarded expressions.
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
                                             const WaitcntBrackets &Brackets);
  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
                                         const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool isDSRead(const MachineInstr &MI) const;
  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
  bool run();

  /// Refresh ForceEmitWaitcnt from the debug counters.
  ///
  /// With NDEBUG defined this does nothing and the flags keep their initial
  /// value of false. Otherwise each counter group is forced exactly when its
  /// debug counter is set and says it should execute; VA_VDST and VM_VSRC are
  /// never forced.
  void setForceEmitWaitcnt() {
#ifndef NDEBUG
    // shouldExecute() is only consulted for counters that are set.
    const bool ForceExp = DebugCounter::isCounterSet(ForceExpCounter) &&
                          DebugCounter::shouldExecute(ForceExpCounter);
    ForceEmitWaitcnt[EXP_CNT] = ForceExp;

    const bool ForceLgkm = DebugCounter::isCounterSet(ForceLgkmCounter) &&
                           DebugCounter::shouldExecute(ForceLgkmCounter);
    ForceEmitWaitcnt[DS_CNT] = ForceLgkm;
    ForceEmitWaitcnt[KM_CNT] = ForceLgkm;

    const bool ForceVM = DebugCounter::isCounterSet(ForceVMCounter) &&
                         DebugCounter::shouldExecute(ForceVMCounter);
    ForceEmitWaitcnt[LOAD_CNT] = ForceVM;
    ForceEmitWaitcnt[SAMPLE_CNT] = ForceVM;
    ForceEmitWaitcnt[BVH_CNT] = ForceVM;

    ForceEmitWaitcnt[VA_VDST] = false;
    ForceEmitWaitcnt[VM_VSRC] = false;
#endif // NDEBUG
  }

  /// Return the appropriate VMEM_*_ACCESS type for \p Inst, which must be a
  /// VMEM instruction (GLOBAL_INV, GLOBAL_WB and GLOBAL_WBINV are handled
  /// before that requirement is asserted).
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    const unsigned Opcode = Inst.getOpcode();

    // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
    // GLOBAL_INV is tracked using loadcnt, but doesn't write VGPRs.
    if (Opcode == AMDGPU::GLOBAL_INV)
      return GLOBAL_INV_ACCESS;

    // The write-back operations are tracked using storecnt.
    if (Opcode == AMDGPU::GLOBAL_WB || Opcode == AMDGPU::GLOBAL_WBINV)
      return VMEM_WRITE_ACCESS;

    // Read event for each VmemType, indexed by getVmemType().
    static const WaitEventType ReadEventForVmemType[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    assert(SIInstrInfo::isVMEM(Inst));

    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
    // these should use VM_CNT.
    if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
      return VMEM_ACCESS;

    // Stores that do not also load, and atomics without a return value.
    const bool IsWriteOnly =
        Inst.mayStore() &&
        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst));
    if (IsWriteOnly)
      return TII.mayAccessScratch(Inst) ? SCRATCH_WRITE_ACCESS
                                        : VMEM_WRITE_ACCESS;

    if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
      return VMEM_ACCESS;

    return ReadEventForVmemType[getVmemType(Inst)];
  }

  std::optional<WaitEventType>
  getExpertSchedulingEventType(const MachineInstr &Inst) const;

  /// \returns true if \p MI is an LDS DMA instruction that either uses
  /// ASYNC_CNT or has a non-zero IsAsync operand.
  bool isAsync(const MachineInstr &MI) const {
    if (!SIInstrInfo::isLDSDMA(MI))
      return false;
    if (SIInstrInfo::usesASYNC_CNT(MI))
      return true;

    // Fall back to the optional IsAsync operand.
    if (const MachineOperand *AsyncOp =
            TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync))
      return AsyncOp->getImm() != 0;
    return false;
  }

  /// \returns true if \p MI may write LDS through DMA and is not async.
  bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
    if (!SIInstrInfo::mayWriteLDSThroughDMA(MI))
      return false;
    return !isAsync(MI);
  }

  /// \returns true if \p MI may write LDS through DMA and is async.
  bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
    if (!SIInstrInfo::mayWriteLDSThroughDMA(MI))
      return false;
    return isAsync(MI);
  }

  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 PreheaderFlushFlags FlushFlags);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  /// \returns all events that correspond to \p Inst.
  WaitEventSet getEventsFor(const MachineInstr &Inst) const;
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
  /// Legalizer. Returns true if block was modified.
  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         bool ExpertMode) const;

  /// Forward to the wait generator: events tracked by counter \p T.
  const WaitEventSet &getWaitEvents(InstCounterType T) const {
    return WCG->getWaitEvents(T);
  }

  /// Forward to the wait generator: counter that tracks event \p E.
  InstCounterType getCounterFromEvent(WaitEventType E) const {
    return WCG->getCounterFromEvent(E);
  }
};


// This object maintains the current score brackets of each wait counter, and

// a per-register scoreboard for each wait counter.

//

// We also maintain the latest score for every event type that can change the

// waitcnt in order to know if there are multiple types of events within

// the brackets. When multiple types of event happen in the bracket,

// wait count may get decreased out of order, therefore we need to put in

// "s_waitcnt 0" before use.

class WaitcntBrackets {

public:

  // Bind the brackets to the pass state \p Context. The assertion checks that
  // the number of register units is below REGUNITS_END, so every register
  // unit maps to an ID inside the register-unit range.
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
    assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
  }


#ifndef NDEBUG
  // Debug-only consistency check: report and abort if any VMem or SGPR
  // tracking entry is still stored but empty when the brackets are destroyed.
  ~WaitcntBrackets() {
    unsigned NumUnusedVmem = 0;
    for (auto &[ID, Val] : VMem)
      NumUnusedVmem += Val.empty() ? 1 : 0;

    unsigned NumUnusedSGPRs = 0;
    for (auto &[ID, Val] : SGPRs)
      NumUnusedSGPRs += Val.empty() ? 1 : 0;

    // Nothing left over: destruction proceeds normally.
    if (NumUnusedVmem == 0 && NumUnusedSGPRs == 0)
      return;

    errs() << "WaitcntBracket had unused entries at destruction time: "
           << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
           << " SGPR unused entries\n";
    std::abort();
  }
#endif


  bool isSmemCounter(InstCounterType T) const {

    return T == Context->SmemAccessCounter || T == X_CNT;

  }


  unsigned getSgprScoresIdx(InstCounterType T) const {

    assert(isSmemCounter(T) && "Invalid SMEM counter");

    return T == X_CNT ? 1 : 0;

  }


  unsigned getOutstanding(InstCounterType T) const {

    return ScoreUBs[T] - ScoreLBs[T];

  }


  bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {

    return getVMemScore(ID, T) > getScoreLB(T);

  }


  /// \Return true if we have no score entries for counter \p T.

  bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }


private:

  unsigned getScoreLB(InstCounterType T) const {

    assert(T < NUM_INST_CNTS);

    return ScoreLBs[T];

  }


  unsigned getScoreUB(InstCounterType T) const {

    assert(T < NUM_INST_CNTS);

    return ScoreUBs[T];

  }


  unsigned getScoreRange(InstCounterType T) const {

    return getScoreUB(T) - getScoreLB(T);

  }


  unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {

    auto It = SGPRs.find(RU);

    return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;

  }


  unsigned getVMemScore(VMEMID TID, InstCounterType T) const {

    auto It = VMem.find(TID);

    return It != VMem.end() ? It->second.Scores[T] : 0;

  }


public:

  bool merge(const WaitcntBrackets &Other);


  bool counterOutOfOrder(InstCounterType T) const;

  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {

    simplifyWaitcnt(Wait, Wait);

  }

  void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,

                       AMDGPU::Waitcnt &UpdateWait) const;

  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;

  void simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const;

  void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,

                    AMDGPU::Waitcnt &UpdateWait) const;

  void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,

                      AMDGPU::Waitcnt &UpdateWait) const;


  void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,

                               AMDGPU::Waitcnt &Wait) const;

  void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,

                              AMDGPU::Waitcnt &Wait) const;

  AMDGPU::Waitcnt determineAsyncWait(unsigned N);

  void tryClearSCCWriteEvent(MachineInstr *Inst);


  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);

  void applyWaitcnt(InstCounterType T, unsigned Count);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait, InstCounterType T);

  void updateByEvent(WaitEventType E, MachineInstr &MI);

  void recordAsyncMark(MachineInstr &MI);


  bool hasPendingEvent() const { return !PendingEvents.empty(); }

  bool hasPendingEvent(WaitEventType E) const {

    return PendingEvents.contains(E);

  }

  bool hasPendingEvent(InstCounterType T) const {

    bool HasPending = PendingEvents & Context->getWaitEvents(T);

    assert(HasPending == !empty(T) &&

           "Expected pending events iff scoreboard is not empty");

    return HasPending;

  }


  bool hasMixedPendingEvents(InstCounterType T) const {

    WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);

    // Return true if more than one bit is set in Events.

    return Events.twoOrMore();

  }


  bool hasPendingFlat() const {

    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&

             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||

            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&

             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));

  }


  void setPendingFlat() {

    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];

    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];

  }


  bool hasPendingGDS() const {

    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];

  }


  unsigned getPendingGDSWait() const {

    return std::min(getScoreUB(DS_CNT) - LastGDS,

                    getWaitCountMax(Context->getLimits(), DS_CNT) - 1);

  }


  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }


  // Return true if there might be pending writes to the vgpr-interval by VMEM

  // instructions with types different from V.

  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {

    for (MCRegUnit RU : regunits(Reg)) {

      auto It = VMem.find(toVMEMID(RU));

      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))

        return true;

    }

    return false;

  }


  void clearVgprVmemTypes(MCPhysReg Reg) {

    for (MCRegUnit RU : regunits(Reg)) {

      if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {

        It->second.VMEMTypes = 0;

        if (It->second.empty())

          VMem.erase(It);

      }

    }

  }


  void setStateOnFunctionEntryOrReturn() {

    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +

                              getWaitCountMax(Context->getLimits(), STORE_CNT));

    PendingEvents |= Context->getWaitEvents(STORE_CNT);

  }


  ArrayRef<const MachineInstr *> getLDSDMAStores() const {

    return LDSDMAStores;

  }


  bool hasPointSampleAccel(const MachineInstr &MI) const;

  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,

                                      MCPhysReg RU) const;


  void print(raw_ostream &) const;

  void dump() const { print(dbgs()); }


  // Free up memory by removing empty entries from the DenseMap that track event

  // scores.

  void purgeEmptyTrackingData();


private:

  struct MergeInfo {

    unsigned OldLB;

    unsigned OtherLB;

    unsigned MyShift;

    unsigned OtherShift;

  };


  using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;


  void determineWaitForScore(InstCounterType T, unsigned Score,

                             AMDGPU::Waitcnt &Wait) const;


  static bool mergeScore(const MergeInfo &M, unsigned &Score,

                         unsigned OtherScore);

  bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,

                       ArrayRef<CounterValueArray> OtherMarks);


  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {

    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");

    if (!Context->TRI.isInAllocatableClass(Reg))

      return {{}, {}};

    const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);

    unsigned Size = Context->TRI.getRegSizeInBits(*RC);

    if (Size == 16 && Context->ST.hasD16Writes32BitVgpr())

      Reg = Context->TRI.get32BitRegister(Reg);

    return Context->TRI.regunits(Reg);

  }


  void setScoreLB(InstCounterType T, unsigned Val) {

    assert(T < NUM_INST_CNTS);

    ScoreLBs[T] = Val;

  }


  void setScoreUB(InstCounterType T, unsigned Val) {

    assert(T < NUM_INST_CNTS);

    ScoreUBs[T] = Val;


    if (T != EXP_CNT)

      return;


    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))

      ScoreLBs[EXP_CNT] =

          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);

  }


  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {

    const SIRegisterInfo &TRI = Context->TRI;

    if (Reg == AMDGPU::SCC) {

      SCCScore = Val;

    } else if (TRI.isVectorRegister(Context->MRI, Reg)) {

      for (MCRegUnit RU : regunits(Reg))

        VMem[toVMEMID(RU)].Scores[T] = Val;

    } else if (TRI.isSGPRReg(Context->MRI, Reg)) {

      auto STy = getSgprScoresIdx(T);

      for (MCRegUnit RU : regunits(Reg))

        SGPRs[RU].Scores[STy] = Val;

    } else {

      llvm_unreachable("Register cannot be tracked/unknown register!");

    }

  }


  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {

    VMem[TID].Scores[T] = Val;

  }


  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,

                         unsigned Val);


  const SIInsertWaitcnts *Context;


  unsigned ScoreLBs[NUM_INST_CNTS] = {0};

  unsigned ScoreUBs[NUM_INST_CNTS] = {0};

  WaitEventSet PendingEvents;

  // Remember the last flat memory operation.

  unsigned LastFlat[NUM_INST_CNTS] = {0};

  // Remember the last GDS operation.

  unsigned LastGDS = 0;


  // The score tracking logic is fragmented as follows:

  // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.

  // - SGPRs: SGPR RegUnits

  // - SCC: Non-allocatable and not general purpose: not a SGPR.

  //

  // For the VMem case, if the key is within the range of LDS DMA IDs,

  // then the corresponding index into the `LDSDMAStores` vector below is:

  //   Key - LDSDMA_BEGIN - 1

  // This is because LDSDMA_BEGIN is a generic entry and does not have an

  // associated MachineInstr.

  //

  // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?


  struct VMEMInfo {

    // Scores for all instruction counters. Zero-initialized.

    CounterValueArray Scores{};

    // Bitmask of the VmemTypes of VMEM instructions for this VGPR.

    unsigned VMEMTypes = 0;


    bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }

  };


  struct SGPRInfo {

    // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt

    // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.

    // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps

    // the X_CNT score.

    std::array<unsigned, 2> Scores = {0};


    bool empty() const { return !Scores[0] && !Scores[1]; }

  };


  DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA

  DenseMap<MCRegUnit, SGPRInfo> SGPRs;


  // Reg score for SCC.

  unsigned SCCScore = 0;

  // The unique instruction that has an SCC write pending, if there is one.

  const MachineInstr *PendingSCCWrite = nullptr;


  // Store representative LDS DMA operations. The only useful info here is

  // alias info. One store is kept per unique AAInfo.

  SmallVector<const MachineInstr *> LDSDMAStores;


  // State of all counters at each async mark encountered so far.

  SmallVector<CounterValueArray> AsyncMarks;


  // But in the rare pathological case, a nest of loops that pushes marks

  // without waiting on any mark can cause AsyncMarks to grow very large. We cap

  // it to a reasonable limit. We can tune this later or potentially introduce a

  // user option to control the value.

  static constexpr unsigned MaxAsyncMarks = 16;


  // Track the upper bound score for async operations that are not part of a

  // mark yet. Initialized to all zeros.

  CounterValueArray AsyncScore{};

};


/// MachineFunctionPass wrapper that exposes wait-instruction insertion under
/// the pass name "SI insert wait instructions".
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  /// Pass identification.
  static char ID;

  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  /// Declares the analyses this pass needs and the ones it keeps valid.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Inputs: loop info and the post-dominator tree are mandatory, alias
    // analysis is used only when it is available.
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();

    // Outputs: the CFG is left untouched and alias analysis stays valid.
    AU.setPreservesCFG();
    AU.addPreserved<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};


} // end anonymous namespace


void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,

                                        InstCounterType CntTy, unsigned Score) {

  setRegScore(Op.getReg().asMCReg(), CntTy, Score);

}


// Return true if the subtarget is one that enables Point Sample Acceleration

// and the MachineInstr passed in is one to which it might be applied (the

// hardware makes this decision based on several factors, but we can't determine

// this at compile time, so we have to assume it might be applied if the

// instruction supports it).

bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {

  if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))

    return false;


  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());

  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =

      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

  return BaseInfo->PointSampleAccel;

}


// Return true if Point Sample Acceleration might be applied to \p MI (see
// hasPointSampleAccel()) and \p Reg has outstanding writes from VMEM types
// other than VMEM_NOSAMPLER, which is the type a point sample accelerated
// instruction effectively becomes.
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                                     MCPhysReg Reg) const {
  return hasPointSampleAccel(MI) &&
         hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
}


// Record that \p Inst produced wait event \p E.
//
// The counter T that E belongs to gets a new upper-bound score (old UB + 1)
// and E is marked pending. That score is then attached to the registers (or
// LDS DMA slots) through which later instructions can depend on \p Inst.
// Which operands are tagged depends on T:
//   - EXP_CNT:           source/data registers, selected per instruction class.
//   - X_CNT:             all register uses.
//   - VA_VDST / VM_VSRC: vector register defs / uses respectively.
//   - anything else:     explicit defs, plus LDS DMA slots, the async score
//                        and SCC where applicable.
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = Context->getCounterFromEvent(E);
  assert(T < Context->MaxCounter);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  // Scores are plain unsigned values; refuse to wrap around to 0.
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
  PendingEvents.insert(E);
  setScoreUB(T, CurrScore);

  const SIRegisterInfo &TRI = Context->TRI;
  const MachineRegisterInfo &MRI = Context->MRI;
  const SIInstrInfo &TII = Context->TII;

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);

      if (Inst.mayStore()) {
        // Stores: tag whichever of the data0/data1 operands exist.
        if (const auto *Data0 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
          setScoreByOperand(*Data0, EXP_CNT, CurrScore);
        if (const auto *Data1 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
          setScoreByOperand(*Data1, EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Returning atomics (other than the excluded opcodes above): tag every
        // vector register use.
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI.isVectorRegister(MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
        }
      }
    } else if (TII.isFLAT(Inst)) {
      // FLAT: tag the named data operand for stores and returning atomics.
      if (Inst.mayStore()) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isMIMG(Inst)) {
      // MIMG: operand 0 for stores, the named data operand for returning
      // atomics.
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isMTBUF(Inst)) {
      // MTBUF: only stores are tagged (operand 0).
      if (Inst.mayStore())
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII.isMUBUF(Inst)) {
      // MUBUF: operand 0 for stores, the named data operand for returning
      // atomics.
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
    } else {
      if (TII.isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI.isVGPR(MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
          }
        }
      }
      // Every other EXP_CNT event: tag all vector register uses.
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI.isVectorRegister(MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents.contains(OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved
      // SMEM and VMEM operations. So there will never be
      // outstanding address translations for both SMEM and
      // VMEM at the same time.
      //
      // The upper bound was already raised to CurrScore above, so this leaves
      // only the current event inside the bracket.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents.remove(OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else if (T == VA_VDST || T == VM_VSRC) {
    // Match the score to the VGPR destination or source registers as
    // appropriate
    for (const MachineOperand &Op : Inst.operands()) {
      // VA_VDST tracks defs only, VM_VSRC tracks uses only.
      if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
          (T == VM_VSRC && Op.isDef()))
        continue;
      if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
        setScoreByOperand(Op, T, CurrScore);
    }
  } else /* any counter not handled above */ {
    // Match the score to the destination registers.
    //
    // Check only explicit operands. Stores, especially spill stores, include
    // implicit uses and defs of their super registers which would create an
    // artificial dependency, while these are there only for register liveness
    // accounting purposes.
    //
    // Special cases where implicit register defs exists, such as M0 or VCC,
    // but none with memory instructions.
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane update of the per-register
          // `VMEMTypes` bitmask below
          assert(TRI.isVectorRegister(MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If instruction can have Point Sample Accel applied, we have to flag
          // this with another potential dependency
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, T, CurrScore);
    }
    if (Inst.mayStore() &&
        (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      //
      // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
      // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
      // store. The "Slot" is the index into LDSDMAStores + 1.
      unsigned Slot = 0;
      // Only the first LDS store memoperand is considered: every path through
      // the loop body below ends in `continue` (not an LDS store) or `break`.
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object.
        if (!AAI || !AAI.Scope)
          break;
        // Look for an already-recorded store with the same AA info.
        // NOTE(review): the inner loop variable below shadows the outer
        // `MemOp`; AAI (taken from the outer one) is compared against the
        // memoperands of the recorded store.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot)
          break;
        // The slot may not be valid because it can be >= NUM_LDSDMA which
        // means the scoreboard cannot track it. We still want to preserve the
        // MI in order to check alias information, though.
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      // The generic entry is always updated; the specific slot only when one
      // was found or created and it is within the trackable range.
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
    }

    // FIXME: Not supported on GFX12 yet. Newer async operations use other
    // counters too, so will need a map from instruction or event types to
    // counter types.
    if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
      assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
             "unexpected GFX1250 instruction");
      // Remember the score so the next async mark captures it (see
      // recordAsyncMark()).
      AsyncScore[T] = CurrScore;
    }

    if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
      // Track the pending SCC write and the instruction that issued it (see
      // tryClearSCCWriteEvent()).
      setRegScore(AMDGPU::SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}


/// Close the current group of async operations: append the accumulated
/// AsyncScore to AsyncMarks as a new mark and start over from all zeros.
/// \p Inst is only used for debug output.
void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
  // No cap is enforced here. Without loops, AsyncMarks grows linearly with the
  // program until an ASYNCMARK_WAIT is seen. Dropping the oldest mark on every
  // push once a limit is exceeded would be unnecessary work in practical
  // cases; the array is truncated separately when a loop is processed, which
  // should be sufficient.
  AsyncMarks.emplace_back(AsyncScore);
  AsyncScore.fill(0);

  LLVM_DEBUG({
    dbgs() << "recordAsyncMark:\n" << Inst;
    for (const CounterValueArray &Recorded : AsyncMarks) {
      llvm::interleaveComma(Recorded, dbgs());
      dbgs() << '\n';
    }
  });
}


/// Dump the state of the brackets to \p OS in human-readable form.
///
/// Output consists of:
///   - one line per counter below Context->MaxCounter, giving the number of
///     outstanding scores followed by the tracked registers / LDS DMA slots
///     whose score is still inside the bracket (scores are printed relative
///     to the lower bound);
///   - the list of pending event types, or "none";
///   - the async score that is not yet part of a mark, or "none" when nothing
///     has been recorded since the last mark;
///   - the number of async marks followed by one line per mark.
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget &ST = Context->ST;

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
    // Counter header. Some counters are named differently on subtargets with
    // extended wait counts.
    switch (T) {
    case LOAD_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case DS_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << SR << "):";
      break;
    case STORE_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case SAMPLE_CNT:
      OS << "    SAMPLE_CNT(" << SR << "):";
      break;
    case BVH_CNT:
      OS << "    BVH_CNT(" << SR << "):";
      break;
    case KM_CNT:
      OS << "    KM_CNT(" << SR << "):";
      break;
    case X_CNT:
      OS << "    X_CNT(" << SR << "):";
      break;
    case VA_VDST:
      OS << "    VA_VDST(" << SR << "): ";
      break;
    case VM_VSRC:
      OS << "    VM_VSRC(" << SR << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      // DenseMap iteration order is not stable; sort the keys so the output
      // is deterministic.
      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        // Scores at or below the lower bound are no longer outstanding.
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      // Note: unlike the register scores above, the SCC score is printed as
      // an absolute value.
      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << "Async score: ";
  // AsyncScore is a fixed-size std::array, so its empty() member is always
  // false and cannot be used to detect "nothing recorded". An all-zero array
  // is the reset state (see recordAsyncMark()), so test for that instead.
  if (all_of(AsyncScore, equal_to(0)))
    OS << "none";
  else
    llvm::interleaveComma(AsyncScore, OS);
  OS << '\n';

  OS << "Async marks: " << AsyncMarks.size() << '\n';

  for (const auto &Mark : AsyncMarks) {
    for (auto T : inst_counter_types()) {
      unsigned MarkedScore = Mark[T];
      switch (T) {
      case LOAD_CNT:
        OS << "  " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
           << "_CNT: " << MarkedScore;
        break;
      case DS_CNT:
        OS << "  " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
           << "_CNT: " << MarkedScore;
        break;
      case EXP_CNT:
        OS << "  EXP_CNT: " << MarkedScore;
        break;
      case STORE_CNT:
        OS << "  " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
           << "_CNT: " << MarkedScore;
        break;
      case SAMPLE_CNT:
        OS << "  SAMPLE_CNT: " << MarkedScore;
        break;
      case BVH_CNT:
        OS << "  BVH_CNT: " << MarkedScore;
        break;
      case KM_CNT:
        OS << "  KM_CNT: " << MarkedScore;
        break;
      case X_CNT:
        OS << "  X_CNT: " << MarkedScore;
        break;
      default:
        OS << "  UNKNOWN: " << MarkedScore;
        break;
      }
    }
    OS << '\n';
  }
  OS << '\n';
}


/// Drop from \p UpdateWait every wait that is redundant given the current
/// brackets and the other waits listed in \p CheckWait.
///
/// \p CheckWait and \p UpdateWait may be the same object (see the
/// single-argument overload), so the order of the steps below is kept fixed:
/// the plain per-counter simplifications run before the X_CNT and VM_VSRC
/// steps that read other counters from \p CheckWait.
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  // Counters that are simplified independently of each other.
  for (InstCounterType T : {LOAD_CNT, EXP_CNT, DS_CNT, STORE_CNT, SAMPLE_CNT,
                            BVH_CNT, KM_CNT})
    simplifyWaitcnt(UpdateWait, T);

  // X_CNT has extra rules that depend on KM_CNT and LOAD_CNT.
  simplifyXcnt(CheckWait, UpdateWait);

  simplifyWaitcnt(UpdateWait, VA_VDST);

  // VM_VSRC can be implied by waits on several other counters.
  simplifyVmVsrc(CheckWait, UpdateWait);
}


/// Reset \p Count to ~0u (no wait) when waiting for it on counter \p T would
/// be redundant; otherwise leave it untouched.
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // UB - LB is the number of events still outstanding on this counter.
  const unsigned Outstanding = getScoreRange(T);

  // A wait that tolerates fewer outstanding events than there are is needed.
  if (Count < Outstanding)
    return;

  // Otherwise the count is already satisfied and the wait can be dropped.
  Count = ~0u;
}


/// Simplify only the \p T component of \p Wait: it is replaced by ~0u (no
/// wait) when it is at least the number of events outstanding on \p T.
void WaitcntBrackets::simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const {
  unsigned Requested = Wait.get(T);
  if (Requested >= getScoreRange(T))
    Requested = ~0u;
  // The component is written back unconditionally.
  Wait.set(T, Requested);
}


/// Simplify the X_CNT component of \p UpdateWait, taking into account waits on
/// KM_CNT and LOAD_CNT found in \p CheckWait that already cover it.
///
/// On entry to a block with multiple predecessors there may be pending SMEM
/// and VMEM events active at the same time; each rule below only looks at one
/// of the two event groups.
///
/// TODO: Revisit xcnt optimizations for gfx1250.
void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  // A wait on XCNT is redundant if we already wait for the SMEM operation to
  // complete. SMEM can return out of order, so this only holds when KM_CNT is
  // waited down to zero.
  const bool CoveredByKmcnt =
      CheckWait.get(KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP);
  if (CoveredByKmcnt)
    UpdateWait.set(X_CNT, ~0u);

  // VMEM loads return in order, so with only loads in flight XCNT is
  // decremented to the same number as LOAD_CNT and a LOAD_CNT wait that is at
  // least as strict covers it. Pending stores defeat this because stores are
  // not waited for.
  const unsigned LoadCnt = CheckWait.get(LOAD_CNT);
  if (LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT) && LoadCnt <= CheckWait.get(X_CNT))
    UpdateWait.set(X_CNT, ~0u);

  // Finally apply the generic "nothing outstanding" rule.
  simplifyWaitcnt(UpdateWait, X_CNT);
}


/// Simplify the VM_VSRC component of \p UpdateWait, dropping it when a wait on
/// another counter in \p CheckWait already implies it.
void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                                     AMDGPU::Waitcnt &UpdateWait) const {
  // An instruction that decrements one of these counters on completion has
  // already decremented VM_VSRC when its VGPR operands were read, so the
  // strictest of these waits also acts as a VM_VSRC wait.
  const unsigned StrictestImplyingWait =
      std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
                CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
                CheckWait.get(DS_CNT)});

  if (StrictestImplyingWait <= CheckWait.get(VM_VSRC))
    UpdateWait.set(VM_VSRC, ~0u);

  // Finally apply the generic "nothing outstanding" rule.
  simplifyWaitcnt(UpdateWait, VM_VSRC);
}


/// Erase every entry of the VMem and SGPRs maps whose empty() is true, i.e.
/// that no longer carries any score (or VMEM type, for VMem).
void WaitcntBrackets::purgeEmptyTrackingData() {
  // The early-increment range advances past an entry before the loop body
  // runs, so the current entry can be erased while iterating.
  for (auto &Entry : make_early_inc_range(VMem))
    if (Entry.second.empty())
      VMem.erase(Entry.first);

  for (auto &Entry : make_early_inc_range(SGPRs))
    if (Entry.second.empty())
      SGPRs.erase(Entry.first);
}


/// Add to \p Wait the wait on counter \p T that is required before the event
/// with score \p ScoreToWait can be considered complete. Nothing is added when
/// the score lies outside the bracket (LB, UB] of the counter.
void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // Only scores that fall within the bracket need a waitcnt.
  if (ScoreToWait <= LB || ScoreToWait > UB)
    return;

  // Two situations force a conservative wait for zero:
  //  - a FLAT operation is pending, this is a VMem or LGKM waitcnt and the
  //    target can report early completion;
  //  - multiple event types are in the bracket, so the counter can get
  //    decremented out of order.
  const bool IsVMemOrLGKM = T == LOAD_CNT || T == DS_CNT;
  if ((IsVMemOrLGKM && hasPendingFlat() &&
       !Context->ST.hasFlatLgkmVMemCountInOrder()) ||
      counterOutOfOrder(T)) {
    addWait(Wait, T, 0);
    return;
  }

  // In-order case: wait until only the events issued after ScoreToWait remain.
  // If the counter has been maxed out, avoid overflow by waiting for
  // MAX(CounterType) - 1 instead.
  const unsigned LargestWait = getWaitCountMax(Context->getLimits(), T) - 1;
  addWait(Wait, T, std::min(UB - ScoreToWait, LargestWait));
}


/// Compute the wait needed so that at most \p N of the recorded async marks
/// remain outstanding, and drop the marks that this wait covers.
///
/// \returns a default-constructed Waitcnt when no more than \p N marks are
/// recorded, otherwise the waits for the scores stored in the newest mark
/// that has to complete.
AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
  LLVM_DEBUG({
    dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
           << ":\n";
    for (const auto &Recorded : AsyncMarks) {
      llvm::interleaveComma(Recorded, dbgs());
      dbgs() << '\n';
    }
  });

  const size_t NumMarks = AsyncMarks.size();

  if (NumMarks == MaxAsyncMarks) {
    // MaxAsyncMarks is not enforced here: that would be unnecessary work
    // because AsyncMarks grows linearly when traversing straight-line code.
    // But truncation may have occurred at a merge, so N is clamped to make
    // sure that a wait is generated.
    LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
    N = std::min(N, (unsigned)MaxAsyncMarks - 1);
  }

  AMDGPU::Waitcnt Wait;
  if (N >= NumMarks) {
    LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
    return Wait;
  }

  // Index of the newest mark that must have completed: everything except the
  // last N marks.
  const size_t WaitedIdx = NumMarks - N - 1;
  const CounterValueArray &WaitedMark = AsyncMarks[WaitedIdx];
  for (InstCounterType T : inst_counter_types())
    determineWaitForScore(T, WaitedMark[T], Wait);

  // The waited mark and all older ones are removed right away. This happens
  // before the wait is actually inserted, which is fine because the wait
  // requirements have already been extracted.
  const size_t NumRemoved = WaitedIdx + 1;
  LLVM_DEBUG({
    dbgs() << "Removing " << NumRemoved
           << " async marks after determining wait\n";
  });
  AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + NumRemoved);

  LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
  return Wait;
}


/// Fold into \p Wait the wait on counter \p T implied by the scores recorded
/// for physical register \p Reg.
void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                              AMDGPU::Waitcnt &Wait) const {
  // SCC is tracked with a single dedicated score.
  if (Reg == AMDGPU::SCC) {
    determineWaitForScore(T, SCCScore, Wait);
    return;
  }

  // Any other register is handled one register unit at a time, reading the
  // VMEM score for vector registers and the SGPR score otherwise.
  const bool UsesVMemScore = Context->TRI.isVectorRegister(Context->MRI, Reg);
  for (MCRegUnit Unit : regunits(Reg)) {
    if (UsesVMemScore)
      determineWaitForScore(T, getVMemScore(toVMEMID(Unit), T), Wait);
    else
      determineWaitForScore(T, getSGPRScore(Unit, T), Wait);
  }
}


/// Fold into \p Wait the wait on counter \p T implied by the VMEM score of
/// the LDS-DMA slot \p TID, which must lie in [LDSDMA_BEGIN, LDSDMA_END).
void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  const auto SlotScore = getVMemScore(TID, T);
  determineWaitForScore(T, SlotScore, Wait);
}


/// Drop the pending SCC write event if \p Inst waits on the same barrier as
/// the recorded S_BARRIER_SIGNAL_ISFIRST_IMM.
///
/// S_BARRIER_WAIT on the same barrier guarantees that the pending write to
/// SCC has landed.
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  if (!PendingSCCWrite)
    return;
  if (PendingSCCWrite->getOpcode() != AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)
    return;
  // Both instructions carry the barrier id as immediate operand 0.
  if (PendingSCCWrite->getOperand(0).getImm() != Inst->getOperand(0).getImm())
    return;

  WaitEventSet SccWriteOnly(SCC_WRITE);

  // When SCC_WRITE is the sole pending KM_CNT event the whole counter can be
  // considered drained.
  if ((PendingEvents & Context->getWaitEvents(KM_CNT)) == SccWriteOnly)
    setScoreLB(KM_CNT, getScoreUB(KM_CNT));

  PendingEvents.remove(SccWriteOnly);
  PendingSCCWrite = nullptr;
}


/// Apply every counter value carried by \p Wait to the score brackets,
/// visiting the counters in inst_counter_types() order.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  for (InstCounterType T : inst_counter_types()) {
    const unsigned Cnt = Wait.get(T);
    applyWaitcnt(T, Cnt);
  }
}


/// Update the score bracket of counter \p T to reflect that a wait leaving

/// at most \p Count events outstanding has been performed.

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {

  const unsigned UB = getScoreUB(T);

  // A count at or above the upper bound leaves the bracket untouched.

  if (Count >= UB)

    return;

  if (Count != 0) {

    // A non-zero wait is ignored while the counter may decrement out of

    // order; otherwise it can only raise the lower bound, never lower it.

    if (counterOutOfOrder(T))

      return;

    setScoreLB(T, std::max(getScoreLB(T), UB - Count));

  } else {

    // A wait for zero empties the bracket and clears every pending event

    // that belongs to this counter.

    setScoreLB(T, UB);

    PendingEvents.remove(Context->getWaitEvents(T));

  }


  // A zero KM_CNT wait while SMEM_GROUP is pending also affects X_CNT: if

  // X_CNT has a single pending event type, a zero wait is applied to it

  // recursively; otherwise only the SMEM_GROUP event is dropped.

  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {

    if (!hasMixedPendingEvents(X_CNT))

      applyWaitcnt(X_CNT, 0);

    else

      PendingEvents.remove(SMEM_GROUP);

  }

  // Likewise a LOAD_CNT wait while VMEM_GROUP is pending and STORE_CNT has

  // nothing pending: the same count is forwarded to X_CNT when its events

  // are not mixed; with mixed events VMEM_GROUP is dropped only for a zero

  // wait.

  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&

      !hasPendingEvent(STORE_CNT)) {

    if (!hasMixedPendingEvents(X_CNT))

      applyWaitcnt(X_CNT, Count);

    else if (Count == 0)

      PendingEvents.remove(VMEM_GROUP);

  }

}


/// Apply only the value that \p Wait carries for counter \p T.
void WaitcntBrackets::applyWaitcnt(const Waitcnt &Wait, InstCounterType T) {
  applyWaitcnt(T, Wait.get(T));
}


// A counter whose bracket holds more than one type of event may be
// decremented out of order. Returns true when that is possible for \p T.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always go out of order.
  const bool SmemAccessPending =
      T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS);
  if (SmemAccessPending || (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
    return true;

  if (T != LOAD_CNT)
    return hasMixedPendingEvents(T);

  // GLOBAL_INV completes in-order with the other LOAD_CNT events
  // (VMEM_ACCESS), so GLOBAL_INV_ACCESS is masked out before testing whether
  // more than one event type is left in the mask.
  unsigned Remaining = hasPendingEvent(T);
  Remaining &= ~(1 << GLOBAL_INV_ACCESS);
  // Non-zero exactly when at least two bits are still set.
  return Remaining & (Remaining - 1);
}


// Register the SIInsertWaitcntsLegacy pass under DEBUG_TYPE and declare the

// analysis wrapper passes it depends on (machine loop info and machine

// post-dominator tree).

INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",

                      false, false)

INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)

INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)

INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",

                    false, false)


// Storage for the legacy pass identifier.

char SIInsertWaitcntsLegacy::ID = 0;


// Exported reference to that same identifier.

char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;


/// Factory for the legacy pass-manager wrapper of this pass. The returned
/// object is heap-allocated with `new`.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  FunctionPass *LegacyPass = new SIInsertWaitcntsLegacy();
  return LegacyPass;
}


/// Set the immediate of the operand named \p OpName on \p MI to \p NewEnc.
/// The named operand must exist on \p MI.
///
/// \returns true if the stored immediate was different and has been changed.
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
                                     unsigned NewEnc) {
  const int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(Idx >= 0);

  MachineOperand &Operand = MI.getOperand(Idx);
  const bool NeedsUpdate = NewEnc != Operand.getImm();
  if (NeedsUpdate)
    Operand.setImm(NewEnc);
  return NeedsUpdate;
}


/// Map \p Opcode to the counter it waits on if it is one of the
/// single-counter S_WAIT_*CNT instructions listed below.
///
/// \returns the counter type, or an empty optional for any other opcode.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
  std::optional<InstCounterType> Counter;
  switch (Opcode) {
  case AMDGPU::S_WAIT_LOADCNT:
    Counter = LOAD_CNT;
    break;
  case AMDGPU::S_WAIT_EXPCNT:
    Counter = EXP_CNT;
    break;
  case AMDGPU::S_WAIT_STORECNT:
    Counter = STORE_CNT;
    break;
  case AMDGPU::S_WAIT_SAMPLECNT:
    Counter = SAMPLE_CNT;
    break;
  case AMDGPU::S_WAIT_BVHCNT:
    Counter = BVH_CNT;
    break;
  case AMDGPU::S_WAIT_DSCNT:
    Counter = DS_CNT;
    break;
  case AMDGPU::S_WAIT_KMCNT:
    Counter = KM_CNT;
    break;
  case AMDGPU::S_WAIT_XCNT:
    Counter = X_CNT;
    break;
  default:
    break;
  }
  return Counter;
}


/// Switch \p Waitcnt to its non-soft opcode if it currently uses a soft one.
///
/// \returns true if the instruction descriptor was replaced.
bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
  const unsigned CurrentOpc = Waitcnt->getOpcode();
  const unsigned NonSoftOpc = SIInstrInfo::getNonSoftWaitcntOpcode(CurrentOpc);
  if (NonSoftOpc != CurrentOpc) {
    Waitcnt->setDesc(TII.get(NonSoftOpc));
    return true;
  }
  return false;
}


/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
///
/// Counters that end up encoded in a surviving instruction are applied to
/// \p ScoreBrackets and reset to "no wait" (~0u) in \p Wait; whatever is left
/// in \p Wait still has to be emitted by the caller.
///
/// \returns true if any instruction in the range was erased or rewritten.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // First surviving instruction of each kind; later ones are merged into it.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  LLVM_DEBUG({
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
    if (It.isEnd())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Early-increment iteration because instructions are erased in the loop.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
    if (II.isMetaInstruction()) {
      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
      continue;
    }

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      assert(ST.hasVMemToLDSLoad());
      LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
                        << "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
      LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);

      // It is possible (but unlikely) that this is the only wait instruction,
      // in which case, we exit this loop without a WaitcntInstr to consume
      // `Wait`. But that works because `Wait` was passed in by reference, and
      // the callee eventually calls createNewWaitcnt on it. We test this
      // possibility in an artificial MIR test since such a situation cannot be
      // recreated by running the memory legalizer.
      II.eraseFromParent();
      // The block changed even if no replacement wait turns out to be needed,
      // so this must be reported to the caller.
      Modified = true;
    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
      // The marker instruction itself stays in place; only the wait it
      // implies is folded into Wait.
      unsigned N = II.getOperand(0).getImm();
      LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
      AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
      Wait = Wait.combined(OldWait);
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.set(STORE_CNT, std::min(Wait.get(STORE_CNT), OldVSCnt));

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    // These three counters are now covered by the kept S_WAITCNT.
    ScoreBrackets.applyWaitcnt(Wait, LOAD_CNT);
    ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
    ScoreBrackets.applyWaitcnt(Wait, DS_CNT);
    Wait.set(LOAD_CNT, ~0u);
    Wait.set(EXP_CNT, ~0u);
    Wait.set(DS_CNT, ~0u);

    LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                   << "New Instr at block end: "
                                   << *WaitcntInstr << '\n'
                          : dbgs() << "applied pre-existing waitcnt\n"
                                   << "Old Instr: " << *It
                                   << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(
        *WaitcntVsCntInstr, AMDGPU::OpName::simm16, Wait.get(STORE_CNT));
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    // The store counter is now covered by the kept S_WAITCNT_VSCNT.
    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.get(STORE_CNT));
    Wait.set(STORE_CNT, ~0u);

    LLVM_DEBUG(It.isEnd()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}


/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait
///
/// New instructions are inserted into \p Block before \p It. When
/// ExpandWaitcntProfiling is set and the affected counters are in order
/// according to \p ScoreBrackets, each wait is expanded into a descending
/// sequence of single-counter waits instead of one instruction.
///
/// \returns true if at least one instruction was inserted.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Helper to emit expanded waitcnt sequence for profiling.
  // Emits waitcnts from (Outstanding-1) down to just above Target and then
  // always finishes with Target itself, so the requested wait is emitted
  // exactly once even when Outstanding <= Target. The `I != ~0u` guard stops
  // the walk when Outstanding is 0 and the subtraction wraps around.
  // The EmitWaitcnt callback emits a single waitcnt.
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
                                 auto EmitWaitcnt) {
    for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
      EmitWaitcnt(I);
    EmitWaitcnt(Target);
    Modified = true;
  };

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    // If profiling expansion is enabled, emit an expanded sequence
    if (ExpandWaitcntProfiling) {
      // Check if any of the counters to be waited on are out-of-order.
      // If so, fall back to normal (non-expanded) behavior since expansion
      // would provide misleading profiling information.
      bool AnyOutOfOrder = false;
      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
        unsigned WaitCnt = Wait.get(CT);
        if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
          AnyOutOfOrder = true;
          break;
        }
      }

      if (AnyOutOfOrder) {
        // Fall back to non-expanded wait
        unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
        BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
        Modified = true;
      } else {
        // All counters are in-order, safe to expand
        for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
          unsigned WaitCnt = Wait.get(CT);
          if (WaitCnt == ~0u)
            continue;

          // Never start above the largest value the counter can encode.
          unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
                                          getWaitCountMax(getLimits(), CT) - 1);
          EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
            // Each step waits on this one counter only.
            AMDGPU::Waitcnt W;
            W.set(CT, Count);
            BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
                .addImm(AMDGPU::encodeWaitcnt(IV, W));
          });
        }
      }
    } else {
      // Normal behavior: emit single combined waitcnt
      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
      [[maybe_unused]] auto SWaitInst =
          BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST.hasVscnt());

    if (ExpandWaitcntProfiling && Wait.get(STORE_CNT) != ~0u &&
        !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
      // Only expand if counter is not out-of-order
      unsigned Outstanding =
          std::min(ScoreBrackets.getOutstanding(STORE_CNT),
                   getWaitCountMax(getLimits(), STORE_CNT) - 1);
      EmitExpandedWaitcnt(
          Outstanding, Wait.get(STORE_CNT), [&](unsigned Count) {
            BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
                .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
                .addImm(Count);
          });
    } else {
      [[maybe_unused]] auto SWaitInst =
          BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
              .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
              .addImm(Wait.get(STORE_CNT));
      Modified = true;

      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  return Modified;
}


/// Build a Waitcnt whose first three counters are zero. The fourth (store)
/// counter is zero only when \p IncludeVSCnt is set and the subtarget has
/// vscnt; otherwise it is left at "no wait" (~0u).
AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  const bool WaitOnStores = IncludeVSCnt && ST.hasVscnt();
  const unsigned StoreCnt = WaitOnStores ? 0 : ~0u;
  return AMDGPU::Waitcnt(0, 0, 0, StoreCnt);
}


/// Build a Waitcnt with zero for the regular counters. The store counter is
/// zero only when \p IncludeVSCnt is set, XCNT is always left at "no wait"
/// (~0u), and the last two fields are zero only in expert mode.
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  const unsigned NoWait = ~0u;
  const unsigned StoreCnt = IncludeVSCnt ? 0 : NoWait;
  const unsigned ExpertCnt = IsExpertMode ? 0 : NoWait;
  return AMDGPU::Waitcnt(0, 0, 0, StoreCnt, 0, 0, 0, NoWait /* XCNT */,
                         ExpertCnt, ExpertCnt);
}


/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and

/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that

/// were added by previous passes. Currently this pass conservatively

/// assumes that these preexisting waits are required for correctness.

///

/// Counters that end up encoded in a surviving instruction are applied to

/// \p ScoreBrackets and reset to "no wait" (~0u) in \p Wait.

///

/// \returns true if any instruction in the range was erased or rewritten.

bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(

    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,

    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {

  assert(!isNormalMode(MaxCounter));


  bool Modified = false;

  // First instruction seen of each kind; each stays null until one is found.

  MachineInstr *CombinedLoadDsCntInstr = nullptr;

  MachineInstr *CombinedStoreDsCntInstr = nullptr;

  MachineInstr *WaitcntDepctrInstr = nullptr;

  // Single-counter wait instructions, indexed by counter type.

  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};


  LLVM_DEBUG({

    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";

    if (It.isEnd())

      dbgs() << "end of block\n";

    else

      dbgs() << *It;

  });


  // Accumulate waits that should not be simplified.

  AMDGPU::Waitcnt RequiredWait;


  // Early-increment iteration because instructions are erased in the loop.

  for (auto &II :

       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {

    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);

    if (II.isMetaInstruction()) {

      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");

      continue;

    }


    // Update required wait count. If this is a soft waitcnt (= it was added

    // by an earlier pass), it may be entirely removed.


    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());

    // True only for soft waitcnts (opcode differs from the non-soft form)

    // and only when OptNone is not set.

    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;


    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't

    // attempt to do more than that either.

    if (Opcode == AMDGPU::S_WAITCNT)

      continue;


    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {

      unsigned OldEnc =

          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);

      // Simplifiable waits go into Wait, all others into RequiredWait.

      if (TrySimplify)

        Wait = Wait.combined(OldWait);

      else

        RequiredWait = RequiredWait.combined(OldWait);

      // Keep the first wait_loadcnt, erase the rest.

      if (CombinedLoadDsCntInstr == nullptr) {

        CombinedLoadDsCntInstr = &II;

      } else {

        II.eraseFromParent();

        Modified = true;

      }

    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {

      unsigned OldEnc =

          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);

      if (TrySimplify)

        Wait = Wait.combined(OldWait);

      else

        RequiredWait = RequiredWait.combined(OldWait);

      // Keep the first wait_storecnt, erase the rest.

      if (CombinedStoreDsCntInstr == nullptr) {

        CombinedStoreDsCntInstr = &II;

      } else {

        II.eraseFromParent();

        Modified = true;

      }

    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {

      unsigned OldEnc =

          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

      // Only the VA_VDST and VM_VSRC fields of the immediate are tracked.

      AMDGPU::Waitcnt OldWait;

      OldWait.set(VA_VDST, AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc));

      OldWait.set(VM_VSRC, AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc));

      if (TrySimplify)

        ScoreBrackets.simplifyWaitcnt(OldWait);

      Wait = Wait.combined(OldWait);

      if (WaitcntDepctrInstr == nullptr) {

        WaitcntDepctrInstr = &II;

      } else {

        // S_WAITCNT_DEPCTR requires special care. Don't remove a

        // duplicate if it is waiting on things other than VA_VDST or

        // VM_VSRC. If that is the case, just make sure the VA_VDST and

        // VM_VSRC subfields of the operand are set to the "no wait"

        // values.


        unsigned Enc =

            TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

        Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);

        Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);


        // Anything other than the default encoding still waits on some

        // other field, so the instruction has to stay.

        if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {

          Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);

          Modified |= promoteSoftWaitCnt(&II);

        } else {

          II.eraseFromParent();

          Modified = true;

        }

      }

    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {

      // Architectures higher than GFX10 do not have direct loads to

      // LDS, so no work required here yet.

      II.eraseFromParent();

      Modified = true;

    } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {

      reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");

    } else {

      // Everything else must be one of the single-counter S_WAIT_*CNT

      // instructions known to counterTypeForInstr().

      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);

      assert(CT.has_value());

      unsigned OldCnt =

          TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

      if (TrySimplify)

        addWait(Wait, CT.value(), OldCnt);

      else

        addWait(RequiredWait, CT.value(), OldCnt);

      // Keep the first wait of its kind, erase the rest.

      if (WaitInstrs[CT.value()] == nullptr) {

        WaitInstrs[CT.value()] = &II;

      } else {

        II.eraseFromParent();

        Modified = true;

      }

    }

  }


  // NOTE(review): presumably this simplifies Wait against the combined

  // requirement before the non-simplifiable waits are folded back in --

  // confirm against the two-argument simplifyWaitcnt() overload.

  ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);

  Wait = Wait.combined(RequiredWait);


  if (CombinedLoadDsCntInstr) {

    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need

    // to be waited for. Otherwise, let the instruction be deleted so

    // the appropriate single counter wait instruction can be inserted

    // instead, when new S_WAIT_*CNT instructions are inserted by

    // createNewWaitcnt(). As a side effect, resetting the wait counts will

    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by

    // the loop below that deals with single counter instructions.

    //

    // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since

    // instructions that have decremented LOAD_CNT or DS_CNT on completion

    // will have needed to wait for their register sources to be available

    // first.

    if (Wait.get(LOAD_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {

      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,

                                           AMDGPU::OpName::simm16, NewEnc);

      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);

      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.get(LOAD_CNT));

      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.get(DS_CNT));

      Wait.set(LOAD_CNT, ~0u);

      Wait.set(DS_CNT, ~0u);


      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"

                                     << "New Instr at block end: "

                                     << *CombinedLoadDsCntInstr << '\n'

                            : dbgs() << "applied pre-existing waitcnt\n"

                                     << "Old Instr: " << *It << "New Instr: "

                                     << *CombinedLoadDsCntInstr << '\n');

    } else {

      CombinedLoadDsCntInstr->eraseFromParent();

      Modified = true;

    }

  }


  if (CombinedStoreDsCntInstr) {

    // Similarly for S_WAIT_STORECNT_DSCNT.

    if (Wait.get(STORE_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {

      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,

                                           AMDGPU::OpName::simm16, NewEnc);

      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);

      ScoreBrackets.applyWaitcnt(Wait, STORE_CNT);

      ScoreBrackets.applyWaitcnt(Wait, DS_CNT);

      Wait.set(STORE_CNT, ~0u);

      Wait.set(DS_CNT, ~0u);


      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"

                                     << "New Instr at block end: "

                                     << *CombinedStoreDsCntInstr << '\n'

                            : dbgs() << "applied pre-existing waitcnt\n"

                                     << "Old Instr: " << *It << "New Instr: "

                                     << *CombinedStoreDsCntInstr << '\n');

    } else {

      CombinedStoreDsCntInstr->eraseFromParent();

      Modified = true;

    }

  }


  // Look for an opportunity to convert existing S_WAIT_LOADCNT,

  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT

  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing

  // instructions so that createNewWaitcnt() will create new combined

  // instructions to replace them.


  if (Wait.get(DS_CNT) != ~0u) {

    // This is a vector of addresses in WaitInstrs pointing to instructions

    // that should be removed if they are present.

    SmallVector<MachineInstr **, 2> WaitsToErase;


    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not

    // both) need to be waited for, ensure that there are no existing

    // individual wait count instructions for these.


    if (Wait.get(LOAD_CNT) != ~0u) {

      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);

      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);

    } else if (Wait.get(STORE_CNT) != ~0u) {

      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);

      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);

    }


    for (MachineInstr **WI : WaitsToErase) {

      if (!*WI)

        continue;


      // Null the slot as well so the loop below skips this counter.

      (*WI)->eraseFromParent();

      *WI = nullptr;

      Modified = true;

    }

  }


  // Re-encode each remaining single-counter wait that is still needed and

  // erase those whose counter no longer requires a wait.

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {

    if (!WaitInstrs[CT])

      continue;


    unsigned NewCnt = Wait.get(CT);

    if (NewCnt != ~0u) {

      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],

                                           AMDGPU::OpName::simm16, NewCnt);

      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);


      ScoreBrackets.applyWaitcnt(CT, NewCnt);

      setNoWait(Wait, CT);


      LLVM_DEBUG(It.isEnd()

                     ? dbgs() << "applied pre-existing waitcnt\n"

                              << "New Instr at block end: " << *WaitInstrs[CT]

                              << '\n'

                     : dbgs() << "applied pre-existing waitcnt\n"

                              << "Old Instr: " << *It

                              << "New Instr: " << *WaitInstrs[CT] << '\n');

    } else {

      WaitInstrs[CT]->eraseFromParent();

      Modified = true;

    }

  }


  if (WaitcntDepctrInstr) {

    // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC

    // subfields with the new required values.

    unsigned Enc =

        TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)

            ->getImm();

    Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.get(VM_VSRC));

    Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.get(VA_VDST));


    // The brackets are updated and the two fields cleared from Wait before

    // deciding whether the instruction itself survives.

    ScoreBrackets.applyWaitcnt(VA_VDST, Wait.get(VA_VDST));

    ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.get(VM_VSRC));

    Wait.set(VA_VDST, ~0u);

    Wait.set(VM_VSRC, ~0u);


    // If that new encoded Depctr immediate would actually still wait

    // for anything, update the instruction's operand. Otherwise it can

    // just be deleted.

    if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {

      Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,

                                           AMDGPU::OpName::simm16, Enc);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"

                                     << "New Instr at block end: "

                                     << *WaitcntDepctrInstr << '\n'

                            : dbgs() << "applyPreexistingWaitcnt\n"

                                     << "Old Instr: " << *It << "New Instr: "

                                     << *WaitcntDepctrInstr << '\n');

    } else {

      WaitcntDepctrInstr->eraseFromParent();

      Modified = true;

    }

  }


  return Modified;

}


/// Insert, before \p It in \p Block, the S_WAIT_*CNT instructions needed for
/// every counter that \p Wait asks for.
///
/// With ExpandWaitcntProfiling set, each in-order counter gets a descending
/// sequence of single-counter waits and the function returns right after
/// that loop. Otherwise DS_CNT is paired with LOAD_CNT or STORE_CNT into a
/// combined instruction where possible, the remaining counters get one
/// instruction each, and VM_VSRC / VA_VDST go into an S_WAITCNT_DEPCTR.
///
/// \returns true if at least one instruction was inserted.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(!isNormalMode(MaxCounter));

  bool Changed = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // For GFX12+, we use separate wait instructions, which makes expansion
  // simpler
  if (ExpandWaitcntProfiling) {
    for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
      const unsigned Requested = Wait.get(CT);
      if (Requested == ~0u)
        continue;

      auto EmitOne = [&](unsigned Val) {
        BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(Val);
      };

      // Only in-order counters get the intermediate steps; an out-of-order
      // counter just receives the requested wait.
      if (!ScoreBrackets.counterOutOfOrder(CT)) {
        const unsigned InFlight =
            std::min(ScoreBrackets.getOutstanding(CT),
                     getWaitCountMax(getLimits(), CT) - 1);
        // Walk from InFlight-1 down to just above the requested value; the
        // ~0u check stops the walk if InFlight was 0 and the subtraction
        // wrapped around.
        unsigned Step = InFlight - 1;
        while (Step != ~0u && Step > Requested) {
          EmitOne(Step);
          --Step;
        }
      }

      EmitOne(Requested);
      Changed = true;
    }
    return Changed;
  }

  // Normal behavior (no expansion)
  // Check for opportunities to use combined wait instructions.
  if (Wait.get(DS_CNT) != ~0u) {
    // LOAD_CNT takes priority as the partner; STORE_CNT is only considered
    // when no LOAD_CNT wait is requested.
    const bool PairWithLoad = Wait.get(LOAD_CNT) != ~0u;
    const bool PairWithStore = !PairWithLoad && Wait.get(STORE_CNT) != ~0u;

    if (PairWithLoad || PairWithStore) {
      const unsigned Enc = PairWithLoad
                               ? AMDGPU::encodeLoadcntDscnt(IV, Wait)
                               : AMDGPU::encodeStorecntDscnt(IV, Wait);
      const unsigned CombinedOpc = PairWithLoad
                                       ? AMDGPU::S_WAIT_LOADCNT_DSCNT
                                       : AMDGPU::S_WAIT_STORECNT_DSCNT;

      [[maybe_unused]] MachineInstr *CombinedWait =
          BuildMI(Block, It, DL, TII.get(CombinedOpc)).addImm(Enc);

      // Both counters are now covered by the combined instruction.
      Wait.set(PairWithLoad ? LOAD_CNT : STORE_CNT, ~0u);
      Wait.set(DS_CNT, ~0u);
      Changed = true;

      LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *CombinedWait << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    const unsigned Requested = Wait.get(CT);
    if (Requested != ~0u) {
      [[maybe_unused]] auto NewWait =
          BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
              .addImm(Requested);

      Changed = true;

      LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *NewWait << '\n');
    }
  }

  if (Wait.hasWaitDepctr()) {
    assert(IsExpertMode);
    // Encode VM_VSRC first, then patch the VA_VDST field into the result.
    unsigned DepctrEnc =
        AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.get(VM_VSRC), ST);
    DepctrEnc = AMDGPU::DepCtr::encodeFieldVaVdst(DepctrEnc, Wait.get(VA_VDST));

    [[maybe_unused]] auto NewWait =
        BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR))
            .addImm(DepctrEnc);

    Changed = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *NewWait << '\n');
  }

  return Changed;
}


/// Generate the wait (if any) that has to be placed before \p MI.
///
/// Instructions of a given type are returned in order, but instructions of
/// different types can complete out of order. We rely on this in-order
/// completion and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine if an access of
/// a memory read requires an s_waitcnt and if so what the value of each
/// counter is. The "score bracket" is bound by the lower bound and upper
/// bound scores (*_score_LB and *_score_ub respectively).
///
/// \param MI              Non-meta instruction the wait is computed for.
/// \param ScoreBrackets   Pending-event state used to decide which counters
///                        need waiting on.
/// \param OldWaitcntInstr Pre-existing wait instruction to merge the new
///                        requirement into, or nullptr.
/// \param FlushFlags      If FlushVmCnt is set, flush the vmcnt-style counters
///                        (LOAD_CNT, SAMPLE_CNT, BVH_CNT) here. If FlushDsCnt
///                        is set, flush the dscnt counter here (GFX12+ only,
///                        where DS_CNT is a separate counter). In both cases
///                        only counters that have pending events are flushed.
/// \returns the result of generateWaitcnt() for the computed wait.
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
  LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
  setForceEmitWaitcnt();

  assert(!MI.isMetaInstruction());

  AMDGPU::Waitcnt Wait;
  const unsigned Opc = MI.getOpcode();

  switch (Opc) {
  case AMDGPU::BUFFER_WBINVL1:
  case AMDGPU::BUFFER_WBINVL1_SC:
  case AMDGPU::BUFFER_WBINVL1_VOL:
  case AMDGPU::BUFFER_GL0_INV:
  case AMDGPU::BUFFER_GL1_INV: {
    // FIXME: This should have already been handled by the memory legalizer.
    // Removing this currently doesn't affect any lit tests, but we need to
    // verify that nothing was relying on this. The number of buffer invalidates
    // being handled here should not be expanded.
    Wait.set(LOAD_CNT, 0);
    break;
  }
  case AMDGPU::SI_RETURN_TO_EPILOG:
  case AMDGPU::SI_RETURN:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
  case AMDGPU::S_SETPC_B64_return: {
    // All waits must be resolved at call return.
    // NOTE: this could be improved with knowledge of all call sites or
    //   with knowledge of the called routines.
    ReturnInsts.insert(&MI);
    AMDGPU::Waitcnt AllZeroWait =
        WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
    // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
    // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
    // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
    // no need to wait for it at function boundaries.
    if (ST.hasExtendedWaitCounts() &&
        !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
      AllZeroWait.set(LOAD_CNT, ~0u);
    Wait = AllZeroWait;
    break;
  }
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED: {
    // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
    // Technically the hardware will do this on its own if we don't, but that
    // might cost extra cycles compared to doing it explicitly.
    // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
    // have to wait for outstanding VMEM stores. In this case it can be useful
    // to send a message to explicitly release all VGPRs before the stores have
    // completed, but it is only safe to do this if there are no outstanding
    // scratch stores.
    EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
                       !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
    break;
  }
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT: {
    if (ST.hasLegacyGeometry() &&
        ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
         AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
      // Resolve vm waits before gs-done.
      Wait.set(LOAD_CNT, 0);
      break;
    }
    // Any other message is handled like a generic instruction.
    [[fallthrough]];
  }
  default: {

    // Export & GDS instructions do not read the EXEC mask until after the
    // export is granted (which can occur well after the instruction is issued).
    // The shader program must flush all EXP operations on the export-count
    // before overwriting the EXEC mask.
    if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.set(EXP_CNT, 0);
      }
    }

    // Wait for any pending GDS instruction to complete before any
    // "Always GDS" instruction.
    if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

    if (MI.isCall()) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      CallInsts.insert(&MI);
      // Discard whatever was accumulated above; only the call address and the
      // return-address register are waited on here.
      Wait = AMDGPU::Waitcnt();

      const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
      if (CallAddrOp.isReg()) {
        ScoreBrackets.determineWaitForPhysReg(
            SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);

        if (const auto *RtnAddrOp =
                TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
          ScoreBrackets.determineWaitForPhysReg(
              SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
        }
      }
    } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
      ScoreBrackets.tryClearSCCWriteEvent(&MI);
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            if (PDT.dominates(MI.getParent(), It->second))
              SLoadAddresses.erase(It);
          }
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII.mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned TID = LDSDMA_BEGIN;
        if (Ptr && Memop->getAAInfo()) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              if ((I + 1) >= NUM_LDSDMA) {
                // We didn't have enough slot to track this LDS DMA store, it
                // has been tracked using the common RegNo (FIRST_LDS_VGPR).
                ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
                break;
              }

              ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
            }
          }
        } else {
          ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
        }
        if (Memop->isStore()) {
          ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
        }
      }

      // Loop over use and def operands.
      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
          continue;

        MCPhysReg Reg = Op.getReg().asMCReg();

        const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
        if (IsVGPR) {
          // Implicit VGPR defs and uses are never a part of the memory
          // instructions description and usually present to account for
          // super-register liveness.
          // TODO: Most of the other instructions also have implicit uses
          // for the liveness accounting only.
          if (Op.isImplicit() && MI.mayLoadOrStore())
            continue;

          ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
          if (Op.isDef())
            ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
          // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
          // previous write and this write are the same type of VMEM
          // instruction, in which case they are (in some architectures)
          // guaranteed to write their results in order anyway.
          // Additionally check instructions where Point Sample Acceleration
          // might be applied.
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
              !ST.hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
            ScoreBrackets.clearVgprVmemTypes(Reg);
          }

          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
          }
          ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
        } else if (Op.getReg() == AMDGPU::SCC) {
          ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
        } else {
          ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
        }

        if (ST.hasWaitXcnt() && Op.isDef())
          ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
      }
    }
  }
  }

  // Ensure safety against exceptions from outstanding memory operations while
  // waiting for a barrier:
  //
  //  * Some subtargets safely handle backing off the barrier in hardware
  //    when an exception occurs.
  //  * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
  //    there can be no outstanding memory operations during the wait.
  //  * Subtargets with split barriers don't need to back off the barrier; it
  //    is up to the trap handler to preserve the user barrier state correctly.
  //
  // In all other cases, ensure safety by ensuring that there are no outstanding
  // memory operations.
  if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
      !ST.hasBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
    Wait.set(DS_CNT, 0);
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
  // waits on VA_VDST if the instruction it would precede is not a VALU
  // instruction, since hardware handles VALU->VGPR->VALU hazards in
  // expert scheduling mode.
  if (TII.isVALU(MI))
    Wait.set(VA_VDST, ~0u);

  // Since the translation for VMEM addresses occur in-order, we can apply the
  // XCnt if the current instruction is of VMEM type and has a memory
  // dependency with another VMEM instruction in flight.
  if (Wait.get(X_CNT) != ~0u && isVmemAccess(MI)) {
    ScoreBrackets.applyWaitcnt(Wait, X_CNT);
    Wait.set(X_CNT, ~0u);
  }

  // When forcing emit, we need to skip terminators because that would break the
  // terminators of the MBB if we emit a waitcnt between terminators.
  if (ForceEmitZeroFlag && !MI.isTerminator())
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  // If we force waitcnt then update Wait accordingly.
  for (InstCounterType T : inst_counter_types()) {
    if (!ForceEmitWaitcnt[T])
      continue;
    Wait.set(T, 0);
  }

  // The flushes below run after simplifyWaitcnt(), so nothing prunes them
  // afterwards. Only request a flush for counters that actually have events
  // in flight; otherwise a redundant wait would be emitted for idle counters.
  if (FlushFlags.FlushVmCnt) {
    for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT}) {
      if (ScoreBrackets.hasPendingEvent(T))
        Wait.set(T, 0);
    }
  }

  if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
    Wait.set(DS_CNT, 0);

  // Only strengthen an already-required LOAD_CNT wait; never introduce one.
  if (ForceEmitZeroLoadFlag && Wait.get(LOAD_CNT) != ~0u)
    Wait.set(LOAD_CNT, 0);

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}


/// Materialize \p Wait in front of \p It inside \p Block.
///
/// The requirement is first folded into pre-existing wait instructions (when
/// \p OldWaitcntInstr is given), then an EXP_CNT wait is folded into a
/// following VINTERP's waitexp operand where possible, and finally new wait
/// instructions are created for whatever is left. \p ScoreBrackets is updated
/// to reflect the applied wait.
///
/// \returns true if any instruction was created, erased or changed.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  // Try to merge the required wait with preexisting waitcnt instructions.
  // This also erases redundant waitcnts.
  bool Changed = OldWaitcntInstr
                     ? WCG->applyPreexistingWaitcnt(ScoreBrackets,
                                                    *OldWaitcntInstr, Wait, It)
                     : false;

  // ExpCnt can be merged into VINTERP.
  const bool FoldIntoVInterp = Wait.get(EXP_CNT) != ~0u &&
                               It != Block.instr_end() &&
                               SIInstrInfo::isVINTERP(*It);
  if (FoldIntoVInterp) {
    MachineOperand *WaitExpOp =
        TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only ever tighten the operand; a stricter existing value is kept.
    if (Wait.get(EXP_CNT) < WaitExpOp->getImm()) {
      WaitExpOp->setImm(Wait.get(EXP_CNT));
      Changed = true;
    }

    // Record the ExpCnt wait in the brackets before clearing it from Wait, so
    // that the final applyWaitcnt() below does not lose it.
    ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
    Wait.set(EXP_CNT, ~0u);

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
    Changed = true;

  // Everything that could be merged into existing instructions has been; now
  // account for the remaining counts in the score brackets.
  ScoreBrackets.applyWaitcnt(Wait);

  return Changed;
}


std::optional<WaitEventType>

SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {

  if (TII.isVALU(Inst)) {

    // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete

    // out-of-order with respect to each other, so each of these classes

    // has its own event.


    if (TII.isXDL(Inst))

      return VGPR_XDL_WRITE;


    if (TII.isTRANS(Inst))

      return VGPR_TRANS_WRITE;


    if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))

      return VGPR_DPMACC_WRITE;


    return VGPR_CSMACC_WRITE;

  }


  // FLAT and LDS instructions may read their VGPR sources out-of-order

  // with respect to each other and all other VMEM instructions, so

  // each of these also has a separate event.


  if (TII.isFLAT(Inst))

    return VGPR_FLAT_READ;


  if (TII.isDS(Inst))

    return VGPR_LDS_READ;


  if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))

    return VGPR_VMEM_READ;


  // Otherwise, no hazard.


  return {};

}


/// \returns true if \p MI is a FLAT instruction that may access VMEM, or a
/// VMEM instruction that is not a MUBUF buffer invalidate.
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
  if (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI))
    return true;

  return TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode());
}


// Return true if the first non-meta instruction at or after It is S_ENDPGM.
// When a block is exhausted the search continues at the start of the next
// block of the function (i.e. it follows fallthrough); false is returned if
// the function ends first.
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
                                    MachineBasicBlock *Block) const {
  const auto FuncEnd = Block->getParent()->end();
  auto CurBlock = Block->getIterator();

  for (;;) {
    if (It.isEnd()) {
      // Ran off the current block; move on to the following one, if any.
      if (++CurBlock == FuncEnd)
        return false;

      It = CurBlock->instr_begin();
      continue;
    }

    // The first real instruction decides the answer.
    if (!It->isMetaInstruction())
      return It->getOpcode() == AMDGPU::S_ENDPGM;

    ++It;
  }
}


// Add a wait after Inst if architecture requirements mandate one:
//  * in precise-memory mode, after every instruction that may load or store;
//  * after every "always GDS" instruction (DS_CNT wait). If that wait was
//    emitted and the next instruction is S_ENDPGM, an S_NOP 0 is inserted
//    as well.
// Returns true if a wait was generated.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Required;

  if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
    // The store counter is only included for stores that are not returning
    // atomics.
    const bool IncludeVSCnt =
        Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst);
    Required = WCG->getAllZeroWaitcnt(IncludeVSCnt);
  }

  const bool IsAlwaysGDS = TII.isAlwaysGDS(Inst.getOpcode());
  if (IsAlwaysGDS)
    Required.set(DS_CNT, 0);

  // Drop anything the current bracket state shows is not needed.
  ScoreBrackets.simplifyWaitcnt(Required);

  auto AfterInst = std::next(Inst.getIterator());
  const bool Inserted = generateWaitcnt(Required, AfterInst, Block,
                                        ScoreBrackets,
                                        /*OldWaitcntInstr=*/nullptr);
  if (!Inserted)
    return false;

  if (IsAlwaysGDS && isNextENDPGM(AfterInst, &Block)) {
    BuildMI(Block, AfterInst, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(0);
  }

  return true;
}


/// Collect the set of wait events that \p Inst raises.
///
/// In expert mode the expert-scheduling event (if any) is always included;
/// on top of that exactly one instruction category below contributes events.
WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
  WaitEventSet Result;

  if (IsExpertMode) {
    if (const auto ExpertEvent = getExpertSchedulingEventType(Inst))
      Result.insert(*ExpertEvent);
  }

  const unsigned Opc = Inst.getOpcode();

  // DS instructions that use LGKM_CNT: either GDS or LDS.
  if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
    const bool IsGDS = TII.isAlwaysGDS(Opc) ||
                       TII.hasModifiersSet(Inst, AMDGPU::OpName::gds);
    if (IsGDS) {
      Result.insert(GDS_ACCESS);
      Result.insert(GDS_GPR_LOCK);
    } else {
      Result.insert(LDS_ACCESS);
    }
    return Result;
  }

  if (TII.isFLAT(Inst)) {
    if (SIInstrInfo::isGFX12CacheInvOrWBInst(Opc)) {
      Result.insert(getVmemWaitEventType(Inst));
      return Result;
    }

    assert(Inst.mayLoadOrStore());
    // A FLAT access may hit VMEM, LDS, or both.
    if (TII.mayAccessVMEMThroughFlat(Inst)) {
      if (ST.hasWaitXcnt())
        Result.insert(VMEM_GROUP);
      Result.insert(getVmemWaitEventType(Inst));
    }
    if (TII.mayAccessLDSThroughFlat(Inst))
      Result.insert(LDS_ACCESS);
    return Result;
  }

  // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
  // followed by an "S_WAITCNT vmcnt(0)" to ensure the writeback has completed.
  const bool IsTrackedVMEM =
      SIInstrInfo::isVMEM(Inst) &&
      (!AMDGPU::getMUBUFIsBufferInv(Opc) || Opc == AMDGPU::BUFFER_WBL2);
  if (IsTrackedVMEM) {
    if (ST.hasWaitXcnt())
      Result.insert(VMEM_GROUP);
    Result.insert(getVmemWaitEventType(Inst));
    if (ST.vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      Result.insert(VMW_GPR_LOCK);
    }
    return Result;
  }

  if (TII.isSMRD(Inst)) {
    if (ST.hasWaitXcnt())
      Result.insert(SMEM_GROUP);
    Result.insert(SMEM_ACCESS);
    return Result;
  }

  if (SIInstrInfo::isLDSDIR(Inst)) {
    Result.insert(EXP_LDS_ACCESS);
    return Result;
  }

  if (SIInstrInfo::isEXP(Inst)) {
    // The export target selects which export event is raised.
    const unsigned Tgt =
        TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Tgt >= AMDGPU::Exp::ET_PARAM0 && Tgt <= AMDGPU::Exp::ET_PARAM31)
      Result.insert(EXP_PARAM_ACCESS);
    else if (Tgt >= AMDGPU::Exp::ET_POS0 && Tgt <= AMDGPU::Exp::ET_POS_LAST)
      Result.insert(EXP_POS_ACCESS);
    else
      Result.insert(EXP_GPR_LOCK);
    return Result;
  }

  if (SIInstrInfo::isSBarrierSCCWrite(Opc)) {
    Result.insert(SCC_WRITE);
    return Result;
  }

  // Remaining scalar instructions that are tracked by opcode.
  switch (Opc) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSG_RTN_B32:
  case AMDGPU::S_SENDMSG_RTN_B64:
  case AMDGPU::S_SENDMSGHALT:
    Result.insert(SQ_MESSAGE);
    break;
  case AMDGPU::S_MEMTIME:
  case AMDGPU::S_MEMREALTIME:
  case AMDGPU::S_GET_BARRIER_STATE_M0:
  case AMDGPU::S_GET_BARRIER_STATE_IMM:
    Result.insert(SMEM_ACCESS);
    break;
  }

  return Result;
}


/// Update \p ScoreBrackets with the effects of having executed \p Inst:
/// record every event the instruction raises, then apply the per-category
/// bookkeeping (pending GDS/flat markers, call and VINTERP implicit waits).
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Walk all event types in their canonical order and record the ones raised.
  const WaitEventSet Raised = getEventsFor(Inst);
  for (WaitEventType Event : wait_events()) {
    if (!Raised.contains(Event))
      continue;
    ScoreBrackets->updateByEvent(Event, Inst);
  }

  if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
    const bool IsGDS = TII.isAlwaysGDS(Inst.getOpcode()) ||
                       TII.hasModifiersSet(Inst, AMDGPU::OpName::gds);
    if (IsGDS)
      ScoreBrackets->setPendingGDS();
    return;
  }

  if (TII.isFLAT(Inst)) {
    // Async/LDSDMA operations have FLAT encoding but do not actually use flat
    // pointers. They do have two operands that each access global and LDS,
    // thus making it appear at this point that they are using a flat pointer.
    // Filter them out, and for the rest, generate a dependency on flat
    // pointers so that both VM and LGKM counters are flushed.
    const bool UsesFlatPointer =
        Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
        TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst);
    if (UsesFlatPointer)
      ScoreBrackets->setPendingFlat();
    return;
  }

  if (Inst.isCall()) {
    // Act as a wait on everything
    ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
    ScoreBrackets->setStateOnFunctionEntryOrReturn();
    return;
  }

  if (TII.isVINTERP(Inst)) {
    // The waitexp operand of VINTERP acts as an EXP_CNT wait.
    const int64_t WaitExp =
        TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, WaitExp);
  }
}


/// Merge a single score using the rebasing information in \p M.
///
/// A score at or below its side's lower bound is treated as 0; any other
/// score is shifted by that side's shift. \p Score is replaced by the larger
/// of the two rebased values.
///
/// \returns true if the other side's rebased score is strictly larger.
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned Mine = 0;
  if (Score > M.OldLB)
    Mine = Score + M.MyShift;

  unsigned Theirs = 0;
  if (OtherScore > M.OtherLB)
    Theirs = OtherScore + M.OtherShift;

  const bool OtherWins = Theirs > Mine;
  Score = OtherWins ? Theirs : Mine;
  return OtherWins;
}


/// Merge the async marks in \p OtherMarks into this bracket's AsyncMarks.
///
/// Marks are aligned from the most recent (last) element backwards. Our list
/// is first trimmed to at most MaxAsyncMarks entries and front-padded with
/// zero marks up to the merged length; then each aligned pair is merged per
/// counter with mergeScore().
///
/// \param MergeInfos Per-counter rebasing information, indexed by counter
///                   type.
/// \param OtherMarks Marks of the state being merged in.
/// \returns true if any merged score from \p OtherMarks strictly dominated
///          ours.
bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                                      ArrayRef<CounterValueArray> OtherMarks) {
  bool StrictDom = false;

  LLVM_DEBUG(dbgs() << "Merging async marks ...");
  // Early exit: both empty
  if (AsyncMarks.empty() && OtherMarks.empty()) {
    LLVM_DEBUG(dbgs() << " nothing to merge\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << '\n');

  // Determine maximum length needed after merging
  auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
  MaxSize = std::min(MaxSize, MaxAsyncMarks);

  // Keep only the most recent marks within our limit.
  if (AsyncMarks.size() > MaxSize)
    AsyncMarks.erase(AsyncMarks.begin(),
                     AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));

  // Pad with zero-filled marks if our list is shorter. Zero represents "no
  // pending async operations at this checkpoint" and acts as the identity
  // element for max() during merging. We pad at the beginning since the marks
  // need to be aligned in most-recent order.
  constexpr CounterValueArray ZeroMark{};
  AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);

  LLVM_DEBUG({
    dbgs() << "Before merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
    dbgs() << "Other marks:\n";
    for (const auto &Mark : OtherMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  // Merge element-wise using the existing mergeScore function and the
  // appropriate MergeInfo for each counter type. Iterate only while we have
  // elements in both vectors.
  //
  // A plain counted loop is used on purpose: MergeCount is 0 whenever
  // OtherMarks is empty, and an inclusive [1, 0] sequence is not a valid
  // range to construct.
  //
  // NOTE(review): our marks that have no counterpart in OtherMarks are not
  // passed through mergeScore() and therefore are not rebased by MyShift --
  // confirm that this is intended.
  const unsigned OtherSize = OtherMarks.size();
  const unsigned OurSize = AsyncMarks.size();
  const unsigned MergeCount = std::min(OtherSize, OurSize);
  for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) {
    for (auto T : inst_counter_types(Context->MaxCounter)) {
      StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
                              OtherMarks[OtherSize - Idx][T]);
    }
  }

  LLVM_DEBUG({
    dbgs() << "After merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  return StrictDom;
}


/// Merge the pending events and associater score brackets of \p Other into

/// this brackets status.

///

/// Returns whether the merge resulted in a change that requires tighter waits

/// (i.e. the merged brackets strictly dominate the original brackets).

bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {

  bool StrictDom = false;


  // Check if "other" has keys we don't have, and create default entries for

  // those. If they remain empty after merging, we will clean it up after.

  for (auto K : Other.VMem.keys())

    VMem.try_emplace(K);

  for (auto K : Other.SGPRs.keys())

    SGPRs.try_emplace(K);


  // Array to store MergeInfo for each counter type

  MergeInfo MergeInfos[NUM_INST_CNTS];


  for (auto T : inst_counter_types(Context->MaxCounter)) {

    // Merge event flags for this counter

    const WaitEventSet &EventsForT = Context->getWaitEvents(T);

    const WaitEventSet OldEvents = PendingEvents & EventsForT;

    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;

    if (!OldEvents.contains(OtherEvents))

      StrictDom = true;

    PendingEvents |= OtherEvents;


    // Merge scores for this counter

    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];

    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];

    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);

    if (NewUB < ScoreLBs[T])

      report_fatal_error("waitcnt score overflow");


    MergeInfo &M = MergeInfos[T];

    M.OldLB = ScoreLBs[T];

    M.OtherLB = Other.ScoreLBs[T];

    M.MyShift = NewUB - ScoreUBs[T];

    M.OtherShift = NewUB - Other.ScoreUBs[T];


    ScoreUBs[T] = NewUB;


    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);


    if (T == DS_CNT)

      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);


    if (T == KM_CNT) {

      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);

      if (Other.hasPendingEvent(SCC_WRITE)) {

        if (!OldEvents.contains(SCC_WRITE)) {

          PendingSCCWrite = Other.PendingSCCWrite;

        } else if (PendingSCCWrite != Other.PendingSCCWrite) {

          PendingSCCWrite = nullptr;

        }

      }

    }


    for (auto &[RegID, Info] : VMem)

      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));


    if (isSmemCounter(T)) {

      unsigned Idx = getSgprScoresIdx(T);

      for (auto &[RegID, Info] : SGPRs) {

        auto It = Other.SGPRs.find(RegID);

        unsigned OtherScore =

            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;

        StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);

      }

    }

  }


  for (auto &[TID, Info] : VMem) {

    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {

      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;

      StrictDom |= NewVmemTypes != Info.VMEMTypes;

      Info.VMEMTypes = NewVmemTypes;

    }

  }


  StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);

  for (auto T : inst_counter_types(Context->MaxCounter))

    StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);


  purgeEmptyTrackingData();

  return StrictDom;

}


/// \returns true if \p Inst (after mapping a "soft" waitcnt opcode to its
/// regular form) is one of the wait instructions tracked by this pass.
/// S_WAITCNT_VSCNT only counts when its first operand is the SGPR_NULL
/// register.
static bool isWaitInstr(MachineInstr &Inst) {
  const unsigned Opc = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());

  switch (Opc) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAIT_LOADCNT_DSCNT:
  case AMDGPU::S_WAIT_STORECNT_DSCNT:
  case AMDGPU::S_WAITCNT_lds_direct:
  case AMDGPU::WAIT_ASYNCMARK:
    return true;
  case AMDGPU::S_WAITCNT_VSCNT: {
    const MachineOperand &Op0 = Inst.getOperand(0);
    if (Op0.isReg() && Op0.getReg() == AMDGPU::SGPR_NULL)
      return true;
    break;
  }
  default:
    break;
  }

  // Anything else is a wait iff it maps to a single counter type.
  return counterTypeForInstr(Opc).has_value();
}


/// Insert an S_SETREG_IMM32_B32 before \p I in \p MBB that writes the
/// SCHED_MODE hardware register: 2 when \p ExpertMode is set, 0 otherwise.
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator I,
                                         bool ExpertMode) const {
  // Encoded (id, offset, size) descriptor of the SCHED_MODE register field.
  const unsigned SchedModeField = AMDGPU::Hwreg::HwregEncoding::encode(
      AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2);
  const int64_t ModeValue = ExpertMode ? 2 : 0;

  BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
      .addImm(ModeValue)
      .addImm(SchedModeField);
}


namespace {

// TODO: Remove this work-around after fixing the scheduler.

// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()

// and ST.partialVCCWritesUpdateVCCZ().

// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may

//    corrupt vccz bit, so when we detect that an instruction may read from

//    a corrupt vccz bit, we need to:

//   1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD

//      operations to complete.

//   2. Recompute the correct value of vccz by writing the current value

//      of vcc back to vcc.

// ii. Partial writes to vcc don't update vccz, so we need to recompute the

//     correct value of vccz by reading vcc and writing it back to vcc.

//     No waitcnt is needed in this case.

class VCCZWorkaround {

  const WaitcntBrackets &ScoreBrackets;

  const GCNSubtarget &ST;

  const SIInstrInfo &TII;

  const SIRegisterInfo &TRI;

  bool VCCZCorruptionBug = false;

  bool VCCZNotUpdatedByPartialWrites = false;

  /// vccz could be incorrect at a basic block boundary if a predecessor wrote

  /// to vcc and then issued an smem load, so initialize to true.

  bool MustRecomputeVCCZ = true;


public:

  VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,

                 const SIInstrInfo &TII, const SIRegisterInfo &TRI)

      : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {

    VCCZCorruptionBug = ST.hasReadVCCZBug();

    VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();

  }

  /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,

  /// then emit a vccz recompute instruction before \p MI. This needs to be

  /// called on every instruction in the basic block because it also tracks the

  /// state and updates MustRecomputeVCCZ accordingly. Returns true if it

  /// modified the IR.

  bool tryRecomputeVCCZ(MachineInstr &MI) {

    // No need to run this if neither bug is present.

    if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)

      return false;


    // If MI is an SMEM and it can corrupt vccz on this target, then we need

    // both to emit a waitcnt and to recompute vccz.

    // But we don't actually emit a waitcnt here. This is done in

    // generateWaitcntInstBefore() because it tracks all the necessary waitcnt

    // state, and can either skip emitting a waitcnt if there is already one in

    // the IR, or emit an "optimized" combined waitcnt.

    // If this is an smem read, it could complete and clobber vccz at any time.

    MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);


    // If the target partial vcc writes don't update vccz, and MI is such an

    // instruction then we must recompute vccz.

    // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling

    // `definesRegister()` more than needed, because it's not very cheap.

    std::optional<bool> PartiallyWritesToVCCOpt;

    auto PartiallyWritesToVCC = [](MachineInstr &MI) {

      return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||

             MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);

    };

    if (VCCZNotUpdatedByPartialWrites) {

      PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);

      // If this is a partial VCC write but won't update vccz, then we must

      // recompute vccz.

      MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;

    }


    // If MI is a vcc write with no pending smem, or there is a pending smem

    // but the target does not suffer from the vccz corruption bug, then we

    // don't need to recompute vccz as this write will recompute it anyway.

    if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {

      // Compute PartiallyWritesToVCCOpt if we haven't done so already.

      if (!PartiallyWritesToVCCOpt)

        PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);

      bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&

                              MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);

      // If we write to the full vcc or we write partially and the target

      // updates vccz on partial writes, then vccz will be updated correctly.

      bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&

                                              *PartiallyWritesToVCCOpt);

      if (UpdatesVCCZ)

        MustRecomputeVCCZ = false;

    }


    // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz

    // restore instruction if either is needed.

    if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {

      // Recompute the vccz bit. Any time a value is written to vcc, the vccz

      // bit is updated, so we can restore the bit by reading the value of vcc

      // and then writing it back to the register.

      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),

              TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),

              TRI.getVCC())

          .addReg(TRI.getVCC());

      MustRecomputeVCCZ = false;

      return true;

    }

    return false;

  }

};


} // namespace


// Generate s_waitcnt instructions where needed.
//
// Visits the instructions of \p Block in order, skipping meta instructions.
// Wait instructions that are already present are remembered and handed to the
// generator; every other instruction first gets a chance to have a wait
// placed in front of it and is then recorded in \p ScoreBrackets. After the
// walk, a final wait is generated at the end of the block if required.
// Returns true if any of the helpers reported a modification.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });
  VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);

  // First pre-existing wait instruction seen since the last ordinary
  // instruction was processed; null if none has been seen.
  MachineInstr *FirstOldWait = nullptr;

  // NOTE: Instructions may be appended after the current one while iterating.
  for (auto Cursor = Block.instr_begin(), Stop = Block.instr_end();
       Cursor != Stop; ++Cursor) {
    MachineInstr &Inst = *Cursor;
    if (Inst.isMetaInstruction())
      continue;

    // Pre-existing waitcnts come from earlier iterations or from the memory
    // legalizer. Only the first one of a run is remembered.
    const bool IsExistingWait =
        isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR);
    if (IsExistingWait) {
      if (FirstOldWait == nullptr)
        FirstOldWait = &Inst;
      continue;
    }

    // The preheader flush decision is only consulted for the first
    // terminator of the block.
    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);

    // Place an s_waitcnt in front of Inst if one is needed, then forget the
    // remembered old wait.
    if (generateWaitcntInstBefore(Inst, ScoreBrackets, FirstOldWait,
                                  FlushFlags))
      Changed = true;
    FirstOldWait = nullptr;

    if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
      // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
      //
      // Asyncmarks record the current wait state and so should not allow
      // waitcnts that occur after them to be merged into waitcnts that occur
      // before.
      assert(ST.getGeneration() < AMDGPUSubtarget::GFX12);
      ScoreBrackets.recordAsyncMark(Inst);
      continue;
    }

    if (TII.isSMRD(Inst)) {
      for (const MachineMemOperand *MMO : Inst.memoperands()) {
        // Invariant loads need no handling when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (MMO->isInvariant())
          continue;
        const Value *Address = MMO->getValue();
        SLoadAddresses.insert(std::pair(Address, Inst.getParent()));
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // Note: insertForcedWaitAfter() may add instrs after Cursor that need to
    // be visited by the loop.
    if (insertForcedWaitAfter(Inst, Block, ScoreBrackets))
      Changed = true;

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // If the target suffers from the vccz bugs, this may emit the necessary
    // vccz recompute instruction before \p Inst if needed.
    if (VCCZW.tryRecomputeVCCZ(Inst))
      Changed = true;
  }

  // A block without a terminator may still be a preheader whose counters have
  // to be flushed; collect the required waits here.
  AMDGPU::Waitcnt Wait;
  const bool HasNoTerminator = Block.getFirstTerminator() == Block.end();
  if (HasNoTerminator) {
    PreheaderFlushFlags EndFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (EndFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
        Wait.set(LOAD_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
        Wait.set(SAMPLE_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(BVH_CNT))
        Wait.set(BVH_CNT, 0);
    }
    if (EndFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
      Wait.set(DS_CNT, 0);
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  if (generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                      FirstOldWait))
    Changed = true;

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Changed;
}


// Erase soft xcnt waits in \p Block that are made redundant by an earlier
// one. Returns true if at least one instruction was erased.
//
// The Memory Legalizer conservatively inserts a soft xcnt before each atomic
// RMW operation. However, for sequences of back-to-back atomic RMWs, only the
// first s_wait_xcnt insertion is necessary.
bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
  // Each candidate is looked at together with its predecessor, so at least
  // two instructions are needed.
  if (Block.size() <= 1)
    return false;

  bool Changed = false;
  // Most recent atomic RMW that had a soft xcnt right before it and has not
  // been invalidated since.
  MachineInstr *PrevGuardedAtomic = nullptr;

  for (MachineInstr &MI : drop_begin(Block)) {
    const bool Loads = MI.mayLoad();
    const bool Stores = MI.mayStore();

    // Ignore last atomic if non-LDS VMEM and SMEM.
    const bool AccessesLDS =
        TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
    if (!AccessesLDS && Loads != Stores)
      PrevGuardedAtomic = nullptr;

    const bool IsAtomicRMW =
        (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && Loads && Stores;
    MachineInstr &Before = *MI.getPrevNode();

    // Only an atomic RMW directly preceded by a soft xcnt is of interest.
    if (!IsAtomicRMW || Before.getOpcode() != AMDGPU::S_WAIT_XCNT_soft)
      continue;

    // An earlier guarded atomic already waited, so this soft xcnt is
    // redundant.
    if (PrevGuardedAtomic != nullptr) {
      Before.eraseFromParent();
      Changed = true;
    }
    PrevGuardedAtomic = &MI;
  }
  return Changed;
}


// Return flags indicating which counters should be flushed in the preheader.
//
// The answer for each block is cached in PreheadersToFlush. A block that has
// no single successor, whose successor is not in a loop, or that is not the
// preheader of that loop keeps the default-constructed flags.
PreheaderFlushFlags
SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                     const WaitcntBrackets &ScoreBrackets) {
  auto [Slot, IsNew] =
      PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());

  // Compute the flags only the first time this block is queried; the slot
  // already holds the default value for every non-preheader case.
  if (IsNew) {
    if (MachineBasicBlock *OnlySucc = MBB.getSingleSuccessor()) {
      MachineLoop *EnclosingLoop = MLI.getLoopFor(OnlySucc);
      if (EnclosingLoop && EnclosingLoop->getLoopPreheader() == &MBB)
        Slot->second = getPreheaderFlushFlags(EnclosingLoop, ScoreBrackets);
    }
  }

  return Slot->second;
}


// Return true if \p MI is a VMEM instruction, or a FLAT instruction that may
// access VMEM. For FLAT instructions only the latter query decides.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  return SIInstrInfo::isFLAT(MI) ? TII.mayAccessVMEMThroughFlat(MI)
                                 : SIInstrInfo::isVMEM(MI);
}


// Return true if \p MI is a DS instruction that may load but may not store.
bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
  if (!SIInstrInfo::isDS(MI))
    return false;
  if (!MI.mayLoad())
    return false;
  return !MI.mayStore();
}


// Check if instruction is a store to LDS that is counted via DSCNT
// (where that counter exists). This holds for any DS instruction that may
// store.
bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
  if (!MI.mayStore())
    return false;
  return SIInstrInfo::isDS(MI);
}


// Return flags indicating which counters should be flushed in the preheader of

// the given loop. We currently decide to flush in the following situations:

// For VMEM (FlushVmCnt):

// 1. The loop contains vmem store(s), no vmem load and at least one use of a

//    vgpr containing a value that is loaded outside of the loop. (Only on

//    targets with no vscnt counter).

// 2. The loop contains vmem load(s), but the loaded values are not used in the

//    loop, and at least one use of a vgpr containing a value that is loaded

//    outside of the loop.

// For DS (FlushDsCnt, GFX12+ only):

// 3. The loop contains no DS reads, and at least one use of a vgpr containing

//    a value that is DS read outside of the loop.

// 4. The loop contains DS read(s), loaded values are not used in the same

//    iteration but in the next iteration (prefetch pattern), and at least one

//    use of a vgpr containing a value that is DS read outside of the loop.

//    Flushing in preheader reduces wait overhead if the wait requirement in

//    iteration 1 would otherwise be more strict (but unfortunately preheader

//    flush decision is taken before knowing that).

// 5. (Single-block loops only) The loop has DS prefetch reads with flush point

//    tracking. Some DS reads may be used in the same iteration (creating

//    "flush points"), but others remain unflushed at the backedge. When a DS

//    read is consumed in the same iteration, it and all prior reads are

//    "flushed" (FIFO order). No DS writes are allowed in the loop.

//    TODO: Find a way to extend to multi-block loops.

// \p ML is the loop whose blocks are scanned. \p Brackets is only queried

// (hasPendingVMEM) to find vgprs that still have a load or DS read pending,

// i.e. values loaded outside of the loop. The returned flags start out

// default-constructed and FlushVmCnt / FlushDsCnt are only ever set to true.

PreheaderFlushFlags

SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,

                                         const WaitcntBrackets &Brackets) {

  PreheaderFlushFlags Flags;

  // Facts collected while scanning the loop body.

  bool HasVMemLoad = false;

  bool HasVMemStore = false;

  bool UsesVgprVMEMLoadedOutside = false;

  bool UsesVgprDSReadOutside = false;

  // Set once a vgpr is both loaded by VMEM and used inside the loop; this

  // rules out the VMEM flush (situations 1. and 2.).

  bool VMemInvalidated = false;

  // DS optimization only applies to GFX12+ where DS_CNT is separate.

  // Tracking status for "no DS read in loop" or "pure DS prefetch

  // (use only in next iteration)".

  bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();

  // Register units used so far, and register units defined so far by a VMEM

  // load or by a DS read respectively.

  DenseSet<MCRegUnit> VgprUse;

  DenseSet<MCRegUnit> VgprDefVMEM;

  DenseSet<MCRegUnit> VgprDefDS;


  // Track DS reads for prefetch pattern with flush points (single-block only).

  // Keeps track of the last DS read (position counted from the top of the loop)

  // to each VGPR. Read is considered consumed (and thus needs flushing) if

  // the dest register has a use or is overwritten (by any later operations).

  DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;

  // Number of DS reads seen so far; the first DS read has position 1.

  unsigned DSReadPosition = 0;

  bool IsSingleBlock = ML->getNumBlocks() == 1;

  bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;

  // Highest position of a DS read whose result was used or overwritten.

  unsigned LastDSFlushPosition = 0;


  for (MachineBasicBlock *MBB : ML->blocks()) {

    for (MachineInstr &MI : *MBB) {

      if (isVMEMOrFlatVMEM(MI)) {

        HasVMemLoad |= MI.mayLoad();

        HasVMemStore |= MI.mayStore();

      }

      // TODO: Can we relax DSStore check? There may be cases where

      // these DS stores are drained prior to the end of MBB (or loop).

      if (mayStoreIncrementingDSCNT(MI)) {

        // Early exit if none of the optimizations are feasible.

        // Otherwise, set tracking status appropriately and continue.

        if (VMemInvalidated)

          return Flags;

        TrackSimpleDSOpt = false;

        TrackDSFlushPoint = false;

      }

      bool IsDSRead = isDSRead(MI);

      if (IsDSRead)

        ++DSReadPosition;


      // Helper: if RU has a pending DS read, update LastDSFlushPosition

      auto updateDSReadFlushTracking = [&](MCRegUnit RU) {

        if (!TrackDSFlushPoint)

          return;

        if (auto It = LastDSReadPositionMap.find(RU);

            It != LastDSReadPositionMap.end()) {

          // RU defined by DSRead is used or overwritten. Need to complete

          // the read, if not already implied by a later DSRead (to any RU)

          // needing to complete in FIFO order.

          LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);

        }

      };


      // Uses are processed before the defs of the same instruction, so an

      // instruction's own defs are not yet in VgprDefVMEM / VgprDefDS here.

      for (const MachineOperand &Op : MI.all_uses()) {

        if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))

          continue;

        // Vgpr use

        for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {

          // If we find a register that is loaded inside the loop, 1. and 2.

          // are invalidated.

          if (VgprDefVMEM.contains(RU))

            VMemInvalidated = true;


          // Check for DS reads used inside the loop

          if (VgprDefDS.contains(RU))

            TrackSimpleDSOpt = false;


          // Early exit if all optimizations are invalidated

          if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)

            return Flags;


          // Check for flush points (DS read used in same iteration)

          updateDSReadFlushTracking(RU);


          VgprUse.insert(RU);

          // Check if this register has a pending VMEM load from outside the

          // loop (value loaded outside and used inside).

          VMEMID ID = toVMEMID(RU);

          if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||

              Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||

              Brackets.hasPendingVMEM(ID, BVH_CNT))

            UsesVgprVMEMLoadedOutside = true;

          // Check if loaded outside the loop via DS (not VMEM/FLAT).

          // Only consider it a DS read if there's no pending VMEM load for

          // this register, since FLAT can set both counters.

          else if (Brackets.hasPendingVMEM(ID, DS_CNT))

            UsesVgprDSReadOutside = true;

        }

      }


      // VMem load vgpr def

      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {

        for (const MachineOperand &Op : MI.all_defs()) {

          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {

            // If we find a register that is loaded inside the loop, 1. and 2.

            // are invalidated.

            if (VgprUse.contains(RU))

              VMemInvalidated = true;

            VgprDefVMEM.insert(RU);

          }

        }

        // Early exit if all optimizations are invalidated

        if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)

          return Flags;

      }


      // DS read vgpr def

      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).

      // If USE comes before DEF, it's the prefetch pattern (use value from

      // previous iteration, read for next iteration). We should still flush

      // in preheader so iteration 1 doesn't need to wait inside the loop.

      // Only invalidate when DEF comes before USE (same-iteration consumption,

      // checked above when processing uses).

      // While flush points are tracked, the defs of every instruction are

      // visited (not only DS reads) so that overwrites are noticed.

      if (IsDSRead || TrackDSFlushPoint) {

        for (const MachineOperand &Op : MI.all_defs()) {

          if (!TRI.isVectorRegister(MRI, Op.getReg()))

            continue;

          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {

            // Check for overwrite of pending DS read (flush point) by any

            // instruction

            updateDSReadFlushTracking(RU);

            if (IsDSRead) {

              VgprDefDS.insert(RU);

              if (TrackDSFlushPoint)

                LastDSReadPositionMap[RU] = DSReadPosition;

            }

          }

        }

      }

    }

  }


  // VMEM flush decision

  // First alternative is situation 1. (stores only, target without vscnt);

  // the second is situation 2. and additionally requires

  // ST.hasVmemWriteVgprInOrder().

  if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&

      ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||

       (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))

    Flags.FlushVmCnt = true;


  // DS flush decision:

  // Simple DS Opt: flush if loop uses DS read values from outside

  // and either has no DS reads in the loop, or DS reads whose results

  // are not used in the loop.

  bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;

  // Prefetch with flush points: some DS reads used in same iteration,

  // but unflushed reads remain at backedge

  // (i.e. at least one DS read comes after the last flush point).

  bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;

  bool DSFlushPointPrefetch =

      TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;


  if (SimpleDSOpt || DSFlushPointPrefetch)

    Flags.FlushDsCnt = true;


  return Flags;

}


// Legacy pass manager entry point: fetch the analyses the implementation
// needs and forward to SIInsertWaitcnts::run(). Alias analysis is optional
// and passed as null when it is not available.
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto &LoopInfo = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  auto &PostDomTree =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();

  auto *AAWrapper = getAnalysisIfAvailable<AAResultsWrapperPass>();
  AliasAnalysis *AA = AAWrapper ? &AAWrapper->getAAResults() : nullptr;

  return SIInsertWaitcnts(LoopInfo, PostDomTree, AA, MF).run();
}


// New pass manager entry point. Alias analysis is taken from the function
// analysis manager's cache only, so it may be null. If the implementation
// reports no change, all analyses are preserved; otherwise the machine
// function pass defaults plus the CFG analyses and AAManager are preserved.
PreservedAnalyses
SIInsertWaitcntsPass::run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
  auto &LoopInfo = MFAM.getResult<MachineLoopAnalysis>(MF);
  auto &PostDomTree = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto *AA = FAM.getCachedResult<AAManager>(MF.getFunction());

  const bool Changed = SIInsertWaitcnts(LoopInfo, PostDomTree, AA, MF).run();
  if (Changed) {
    PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
    PA.preserveSet<CFGAnalyses>();
    PA.preserve<AAManager>();
    return PA;
  }

  return PreservedAnalyses::all();
}


// Run the pass over the machine function MF held by this object.

//

// Steps, in order: set up the hardware limits and the wait-count generator;

// for non-entry functions insert waits at the top of the entry block; process

// all blocks repeatedly until no block is dirty on a backedge; then perform

// the end-of-function insertions (s_dcache_wb, expert scheduling mode

// switches, VGPR deallocation). Returns true if the function was modified.

bool SIInsertWaitcnts::run() {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();


  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());


  // Initialize hardware limits first, as they're needed by the generators.

  Limits = AMDGPU::HardwareLimits(IV);


  if (ST.hasExtendedWaitCounts()) {

    // Expert mode requires subtarget support. The command-line flag decides

    // when it was given explicitly; otherwise the function attribute does.

    IsExpertMode = ST.hasExpertSchedulingMode() &&

                   (ExpertSchedulingModeFlag.getNumOccurrences()

                        ? ExpertSchedulingModeFlag

                        : MF.getFunction()

                              .getFnAttribute("amdgpu-expert-scheduling-mode")

                              .getValueAsBool());

    MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;

    // Initialize WCG per MF. It contains state that depends on MF attributes.

    WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,

                                                      IsExpertMode);

  } else {

    MaxCounter = NUM_NORMAL_INST_CNTS;

    // Initialize WCG per MF. It contains state that depends on MF attributes.

    WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,

                                                     Limits);

  }


  // Cache the counter that SMEM_ACCESS events map to.

  SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);


  bool Modified = false;


  MachineBasicBlock &EntryBB = MF.front();


  if (!MFI->isEntryFunction()) {

    // Wait for any outstanding memory operations that the input registers may

    // depend on. We can't track them and it's better to do the wait after the

    // costly call sequence.


    // TODO: Could insert earlier and schedule more liberally with operations

    // that only use caller preserved registers.

    // Insert after any leading meta instructions of the entry block.

    MachineBasicBlock::iterator I = EntryBB.begin();

    while (I != EntryBB.end() && I->isMetaInstruction())

      ++I;


    if (ST.hasExtendedWaitCounts()) {

      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))

          .addImm(0);

      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {

        // No separate wait is emitted for these four counters here.

        // NOTE(review): LOAD_CNT and DS_CNT are presumably covered by the

        // S_WAIT_LOADCNT_DSCNT emitted above - confirm.

        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)

          continue;


        // Targets without image instructions get no wait for these counters.

        if (!ST.hasImageInsts() &&

            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))

          continue;


        BuildMI(EntryBB, I, DebugLoc(),

                TII.get(instrsForExtendedCounterTypes[CT]))

            .addImm(0);

      }

      if (IsExpertMode) {

        // Encode both the VaVdst and the VmVsrc field as 0.

        unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);

        Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);

        BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))

            .addImm(Enc);

      }

    } else {

      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);

    }


    // Give the entry block an explicit incoming state for non-kernel

    // functions.

    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);

    NonKernelInitialState->setStateOnFunctionEntryOrReturn();

    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);


    Modified = true;

  }


  // Keep iterating over the blocks in reverse post order, inserting and

  // updating s_waitcnt where needed, until a fix point is reached.

  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))

    BlockInfos.try_emplace(MBB);


  // Working state for the block being processed. It may be moved into a

  // successor's BlockInfo below, in which case it is null afterwards and is

  // re-created for the next block.

  std::unique_ptr<WaitcntBrackets> Brackets;

  bool Repeat;

  do {

    Repeat = false;


    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;

         ++BII) {

      MachineBasicBlock *MBB = BII->first;

      BlockInfo &BI = BII->second;

      // Only blocks marked dirty are (re)processed.

      if (!BI.Dirty)

        continue;


      // Start from the block's incoming state if it has one, otherwise from

      // a freshly constructed state. An existing Brackets object is reused.

      if (BI.Incoming) {

        if (!Brackets)

          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);

        else

          *Brackets = *BI.Incoming;

      } else {

        if (!Brackets) {

          Brackets = std::make_unique<WaitcntBrackets>(this);

        } else {

          // Reinitialize in-place. N.B. do not do this by assigning from a

          // temporary because the WaitcntBrackets class is large and it could

          // cause this function to use an unreasonable amount of stack space.

          Brackets->~WaitcntBrackets();

          new (Brackets.get()) WaitcntBrackets(this);

        }

      }


      if (ST.hasWaitXcnt())

        Modified |= removeRedundantSoftXcnts(*MBB);

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);

      BI.Dirty = false;


      // The outgoing state is propagated to the successors only if something

      // is still pending at the end of the block.

      if (Brackets->hasPendingEvent()) {

        // First successor found without an incoming state; it receives the

        // Brackets object itself instead of a copy.

        BlockInfo *MoveBracketsToSucc = nullptr;

        for (MachineBasicBlock *Succ : MBB->successors()) {

          auto *SuccBII = BlockInfos.find(Succ);

          BlockInfo &SuccBI = SuccBII->second;

          if (!SuccBI.Incoming) {

            SuccBI.Dirty = true;

            // The successor is at or before the current block in BlockInfos

            // order, so this sweep will not reach it again: sweep once more.

            if (SuccBII <= BII) {

              LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");

              Repeat = true;

            }

            if (!MoveBracketsToSucc) {

              MoveBracketsToSucc = &SuccBI;

            } else {

              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);

            }

          } else {

            LLVM_DEBUG({

              dbgs() << "Try to merge ";

              MBB->printName(dbgs());

              dbgs() << " into ";

              Succ->printName(dbgs());

              dbgs() << '\n';

            });

            // The successor is only marked dirty again when merge() returns

            // true.

            if (SuccBI.Incoming->merge(*Brackets)) {

              SuccBI.Dirty = true;

              if (SuccBII <= BII) {

                LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");

                Repeat = true;

              }

            }

          }

        }

        // Done after the loop because the copies and merges above still read

        // *Brackets.

        if (MoveBracketsToSucc)

          MoveBracketsToSucc->Incoming = std::move(Brackets);

      }

    }

  } while (Repeat);


  if (ST.hasScalarStores()) {

    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

    bool HaveScalarStores = false;


    // Note whether the function has any scalar store and collect the blocks

    // that contain S_ENDPGM or SI_RETURN_TO_EPILOG.

    for (MachineBasicBlock &MBB : MF) {

      for (MachineInstr &MI : MBB) {

        if (!HaveScalarStores && TII.isScalarStore(MI))

          HaveScalarStores = true;


        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||

            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)

          EndPgmBlocks.push_back(&MBB);

      }

    }


    if (HaveScalarStores) {

      // If scalar writes are used, the cache must be flushed or else the next

      // wave to reuse the same scratch memory can be clobbered.

      //

      // Insert s_dcache_wb at wave termination points if there were any scalar

      // stores, and only if the cache hasn't already been flushed. This could

      // be improved by looking across blocks for flushes in postdominating

      // blocks from the stores but an explicitly requested flush is probably

      // very rare.

      for (MachineBasicBlock *MBB : EndPgmBlocks) {

        // True while the most recent S_DCACHE_WB in this block has not been

        // followed by a scalar store.

        bool SeenDCacheWB = false;


        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();

             I != E; ++I) {

          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)

            SeenDCacheWB = true;

          else if (TII.isScalarStore(*I))

            SeenDCacheWB = false;


          // FIXME: It would be better to insert this before a waitcnt if any.

          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||

               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&

              !SeenDCacheWB) {

            Modified = true;

            BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));

          }

        }

      }

    }

  }


  if (IsExpertMode) {

    // Enable expert scheduling on function entry. To satisfy ABI requirements

    // and to allow calls between function with different expert scheduling

    // settings, disable it around calls and before returns.


    // Skip leading meta instructions of the entry block.

    MachineBasicBlock::iterator I = EntryBB.begin();

    while (I != EntryBB.end() && I->isMetaInstruction())

      ++I;

    setSchedulingMode(EntryBB, I, true);


    // Switch the mode off before each recorded call and back on after it.

    for (MachineInstr *MI : CallInsts) {

      MachineBasicBlock &MBB = *MI->getParent();

      setSchedulingMode(MBB, MI, false);

      setSchedulingMode(MBB, std::next(MI->getIterator()), true);

    }


    for (MachineInstr *MI : ReturnInsts)

      setSchedulingMode(*MI->getParent(), MI, false);


    Modified = true;

  }


  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.

  // This is done in different ways depending on how the VGPRs were allocated

  // (i.e. whether we're in dynamic VGPR mode or not).

  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short

  // waveslot limited kernel runs slower with the deallocation.

  // Neither branch is taken when WCG->isOptNone() is true.

  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {

    for (auto [MI, _] : EndPgmInsts) {

      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

              TII.get(AMDGPU::S_ALLOC_VGPR))

          .addImm(0);

      Modified = true;

    }

  } else if (!WCG->isOptNone() &&

             ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&

             (MF.getFrameInfo().hasCalls() ||

              ST.getOccupancyWithNumVGPRs(

                  TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),

                  /*IsDynamicVGPR=*/false) <

                  AMDGPU::IsaInfo::getMaxWavesPerEU(&ST))) {

    for (auto [MI, Flag] : EndPgmInsts) {

      // Only end-of-program instructions whose recorded flag is set get the

      // dealloc message.

      if (Flag) {

        if (ST.requiresNopBeforeDeallocVGPRs()) {

          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

                  TII.get(AMDGPU::S_NOP))

              .addImm(0);

        }

        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

                TII.get(AMDGPU::S_SENDMSG))

            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);

        Modified = true;

      }

    }

  }


  return Modified;

}

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

AMDGPUBaseInfo.h

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

AMDGPU.h

MBB
MachineBasicBlock & MBB
Definition ARMSLSHardening.cpp:71

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

AliasAnalysis.h

print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition ArchiveWriter.cpp:205

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661

DebugCounter.h
This file provides an implementation of debug counters.

DEBUG_COUNTER
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition DebugCounter.h:181

Dominators.h

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

isOptNone
static bool isOptNone(const MachineFunction &MF)
Definition HexagonFrameLowering.cpp:371

_
#define _
Definition HexagonMCCodeEmitter.cpp:46

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

InitializePasses.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

InlinePriorityMode::ML
@ ML
Definition InlineOrder.cpp:25

LoopDeletionResult::Modified
@ Modified
Definition LoopDeletion.cpp:47

merge
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Definition LoopDeletion.cpp:51

I
#define I(x, y, z)
Definition MD5.cpp:57

MachineFrameInfo.h

MachineLoopInfo.h

MachinePassManager.h

MachinePostDominators.h

Reg
Register Reg
Definition MachineSink.cpp:2119

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2120

MapVector.h
This file implements a map that provides insertion order iteration.

Context
@ Context
Definition MemProfContextDisambiguation.cpp:135

T
#define T
Definition Mips16ISelLowering.cpp:282

isReg
static bool isReg(const MCInst &MI, unsigned OpNo)
Definition MipsInstPrinter.cpp:32

OpIdx
MachineInstr unsigned OpIdx
Definition NVPTXPrologEpilogPass.cpp:56

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

ForceEmitZeroLoadFlag
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)

AMDGPU_EVENT_NAME
#define AMDGPU_EVENT_NAME(Name)
Definition SIInsertWaitcnts.cpp:189

updateOperandIfDifferent
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
Definition SIInsertWaitcnts.cpp:1695

isWaitInstr
static bool isWaitInstr(MachineInstr &Inst)
Definition SIInsertWaitcnts.cpp:3101

counterTypeForInstr
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
Definition SIInsertWaitcnts.cpp:1711

ExpertSchedulingModeFlag
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)

ForceEmitZeroFlag
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)

AMDGPU_DECLARE_WAIT_EVENTS
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
Definition SIInsertWaitcnts.cpp:137

AMDGPU_EVENT_ENUM
#define AMDGPU_EVENT_ENUM(Name)
Definition SIInsertWaitcnts.cpp:166

SIMachineFunctionInfo.h

contains
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487

Sequence.h
Provides some synthesis utilities to produce sequences of values.

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

TargetParser.h

getFunction
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
Definition WebAssemblyLowerEmscriptenEHSjLj.cpp:442

IV
static const uint32_t IV[8]
Definition blake3_impl.h:83

llvm::AAManager
A manager for alias analyses.
Definition AliasAnalysis.h:974

llvm::AMDGPUMachineFunctionInfo::isEntryFunction
bool isEntryFunction() const
Definition AMDGPUMachineFunctionInfo.h:89

llvm::AMDGPU::Waitcnt
Represents the counter values to wait for in an s_waitcnt instruction.
Definition AMDGPUBaseInfo.h:1128

llvm::AMDGPU::Waitcnt::get
unsigned get(InstCounterType T) const
Definition AMDGPUBaseInfo.h:1141

llvm::AMDGPU::Waitcnt::set
void set(InstCounterType T, unsigned Val)
Definition AMDGPUBaseInfo.h:1167

llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition PassManager.h:411

llvm::AnalysisUsage::addUsedIfAvailable
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
Definition PassAnalysisSupport.h:118

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition PassAnalysisSupport.h:99

llvm::AnalysisUsage::setPreservesCFG
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270

llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142

llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137

llvm::Attribute::getValueAsBool
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition Attributes.cpp:391

llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73

llvm::DebugCounter::shouldExecute
static bool shouldExecute(CounterInfo &Counter)
Definition DebugCounter.h:107

llvm::DebugCounter::isCounterSet
static bool isCounterSet(CounterInfo &Info)
Definition DebugCounter.h:116

llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178

llvm::DenseMapBase::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256

llvm::DenseMapBase::erase
bool erase(const KeyT &Val)
Definition DenseMap.h:330

llvm::DenseMapBase::end
iterator end()
Definition DenseMap.h:81

llvm::DenseMapBase::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241

llvm::DominatorTreeBase::dominates
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Definition GenericDomTree.h:493

llvm::FunctionAnalysisManagerMachineFunctionProxy
Definition MachinePassManager.h:130

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function::getFnAttribute
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763

llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition GenericLoopInfoImpl.h:210

llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition GenericLoopInfo.h:606

llvm::MachineBasicBlock::getSingleSuccessor
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition MachineBasicBlock.cpp:996

llvm::MachineBasicBlock::begin
iterator begin()
Definition MachineBasicBlock.h:382

llvm::MachineBasicBlock::findDebugLoc
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Definition MachineBasicBlock.cpp:1561

llvm::MachineBasicBlock::instr_iterator
Instructions::iterator instr_iterator
Definition MachineBasicBlock.h:341

llvm::MachineBasicBlock::end
iterator end()
Definition MachineBasicBlock.h:384

llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition MachineBasicBlock.h:471

llvm::MachineBasicBlock::printName
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as: bb.{number}[.{ir-name}] [(attributes...)]
Definition MachineBasicBlock.cpp:490

llvm::MachineBasicBlock::iterator
MachineInstrBundleIterator< MachineInstr > iterator
Definition MachineBasicBlock.h:346

llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition MachineFunctionPass.cpp:188

llvm::MachineFunction
Definition MachineFunction.h:295

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:755

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition MachineFunction.h:896

llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition MachineFunction.h:1036

llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Definition MachineInstrBuilder.h:199

llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition MachineInstrBuilder.h:233

llvm::MachineInstr
Representation of each machine instruction.
Definition MachineInstr.h:73

llvm::MachineInstr::defs
mop_range defs()
Returns all explicit operands that are register definitions.
Definition MachineInstr.h:737

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition MachineInstr.h:600

llvm::MachineInstr::mayLoadOrStore
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
Definition MachineInstr.h:1180

llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition MachineInstr.h:372

llvm::MachineInstr::all_defs
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition MachineInstr.h:767

llvm::MachineInstr::isCall
bool isCall(QueryType Type=AnyInBundle) const
Definition MachineInstr.h:969

llvm::MachineInstr::mayLoad
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition MachineInstr.h:1157

llvm::MachineInstr::operands
mop_range operands()
Definition MachineInstr.h:706

llvm::MachineInstr::setDesc
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
Definition MachineInstr.cpp:145

llvm::MachineInstr::memoperands
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition MachineInstr.h:793

llvm::MachineInstr::print
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
Definition MachineInstr.cpp:1803

llvm::MachineInstr::mayStore
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition MachineInstr.h:1170

llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition MachineInstr.h:524

llvm::MachineInstr::all_uses
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
Definition MachineInstr.h:777

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition MachineInstr.h:608

llvm::MachineInstr::eraseFromParent
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Definition MachineInstr.cpp:800

llvm::MachineInstr::isMetaInstruction
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Definition MachineInstr.h:955

llvm::MachineLoopAnalysis
Analysis pass that exposes the MachineLoopInfo for a machine function.
Definition MachineLoopInfo.h:140

llvm::MachineLoopInfoWrapperPass
Definition MachineLoopInfo.h:161

llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition MachineOperand.h:49

llvm::MachineOperand::setImm
void setImm(int64_t immVal)
Definition MachineOperand.h:694

llvm::MachineOperand::getImm
int64_t getImm() const
Definition MachineOperand.h:560

llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition MachineOperand.h:331

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition MachineOperand.h:372

llvm::MachinePostDominatorTreeAnalysis
Definition MachinePostDominators.h:71

llvm::MachinePostDominatorTreeWrapperPass
Definition MachinePostDominators.h:95

llvm::MapVector::end
iterator end()
Definition MapVector.h:67

llvm::MapVector::find
iterator find(const KeyT &Key)
Definition MapVector.h:154

llvm::MapVector::begin
iterator begin()
Definition MapVector.h:65

llvm::MapVector::try_emplace
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116

llvm::Pass::print
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PreservedAnalyses::preserveSet
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151

llvm::Register::asMCReg
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107

llvm::SIInsertWaitcntsPass::run
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Definition SIInsertWaitcnts.cpp:3609

llvm::SIInstrInfo::isCBranchVCCZRead
static bool isCBranchVCCZRead(const MachineInstr &MI)
Definition SIInstrInfo.h:841

llvm::SIInstrInfo::isDS
static bool isDS(const MachineInstr &MI)
Definition SIInstrInfo.h:616

llvm::SIInstrInfo::isVMEM
static bool isVMEM(const MachineInstr &MI)
Definition SIInstrInfo.h:489

llvm::SIInstrInfo::isFLATScratch
static bool isFLATScratch(const MachineInstr &MI)
Definition SIInstrInfo.h:700

llvm::SIInstrInfo::isEXP
static bool isEXP(const MachineInstr &MI)
Definition SIInstrInfo.h:785

llvm::SIInstrInfo::mayWriteLDSThroughDMA
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
Definition SIInstrInfo.h:827

llvm::SIInstrInfo::isLDSDIR
static bool isLDSDIR(const MachineInstr &MI)
Definition SIInstrInfo.h:1002

llvm::SIInstrInfo::isGWS
static bool isGWS(const MachineInstr &MI)
Definition SIInstrInfo.h:634

llvm::SIInstrInfo::isFLATGlobal
static bool isFLATGlobal(const MachineInstr &MI)
Definition SIInstrInfo.h:692

llvm::SIInstrInfo::isVSAMPLE
static bool isVSAMPLE(const MachineInstr &MI)
Definition SIInstrInfo.h:660

llvm::SIInstrInfo::isAtomicRet
static bool isAtomicRet(const MachineInstr &MI)
Definition SIInstrInfo.h:809

llvm::SIInstrInfo::isImage
static bool isImage(const MachineInstr &MI)
Definition SIInstrInfo.h:481

llvm::SIInstrInfo::getNonSoftWaitcntOpcode
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition SIInstrInfo.h:1160

llvm::SIInstrInfo::isVINTERP
static bool isVINTERP(const MachineInstr &MI)
Definition SIInstrInfo.h:1010

llvm::SIInstrInfo::isGFX12CacheInvOrWBInst
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
Definition SIInstrInfo.h:1126

llvm::SIInstrInfo::isSBarrierSCCWrite
static bool isSBarrierSCCWrite(unsigned Opcode)
Definition SIInstrInfo.h:835

llvm::SIInstrInfo::isMIMG
static bool isMIMG(const MachineInstr &MI)
Definition SIInstrInfo.h:644

llvm::SIInstrInfo::usesASYNC_CNT
static bool usesASYNC_CNT(const MachineInstr &MI)
Definition SIInstrInfo.h:1030

llvm::SIInstrInfo::isFLAT
static bool isFLAT(const MachineInstr &MI)
Definition SIInstrInfo.h:676

llvm::SIInstrInfo::isLDSDMA
static bool isLDSDMA(const MachineInstr &MI)
Definition SIInstrInfo.h:624

llvm::SIInstrInfo::isAtomicNoRet
static bool isAtomicNoRet(const MachineInstr &MI)
Definition SIInstrInfo.h:801

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition SIMachineFunctionInfo.h:415

llvm::SIMachineFunctionInfo::isDynamicVGPREnabled
bool isDynamicVGPREnabled() const
Definition SIMachineFunctionInfo.h:850

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:419

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:80

llvm::StringLiteral
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882

llvm::cl::opt
Definition CommandLine.h:1454

llvm::detail::DenseSetImpl::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202

llvm::detail::DenseSetImpl::contains
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

uint32_t

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

OpName
Definition R600Defines.h:62

false
Definition MachinePipeliner.cpp:245

llvm::AA
Abstract Attribute helper functions.
Definition Attributor.h:165

llvm::AArch64CC::LS
@ LS
Definition AArch64BaseInfo.h:298

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPUAddrSpace.h:32

llvm::AMDGPU::DepCtr::encodeFieldVaVdst
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
Definition AMDGPUBaseInfo.cpp:2168

llvm::AMDGPU::DepCtr::encodeFieldVmVsrc
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
Definition AMDGPUBaseInfo.cpp:2159

llvm::AMDGPU::DepCtr::decodeFieldVaVdst
unsigned decodeFieldVaVdst(unsigned Encoded)
Definition AMDGPUBaseInfo.cpp:2134

llvm::AMDGPU::DepCtr::getDefaultDepCtrEncoding
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2089

llvm::AMDGPU::DepCtr::decodeFieldVmVsrc
unsigned decodeFieldVmVsrc(unsigned Encoded)
Definition AMDGPUBaseInfo.cpp:2130

llvm::AMDGPU::Exp::Target
Target
Definition SIDefines.h:1023

llvm::AMDGPU::Exp::ET_PARAM31
@ ET_PARAM31
Definition SIDefines.h:1036

llvm::AMDGPU::Exp::ET_POS0
@ ET_POS0
Definition SIDefines.h:1028

llvm::AMDGPU::Exp::ET_POS_LAST
@ ET_POS_LAST
Definition SIDefines.h:1031

llvm::AMDGPU::Exp::ET_PARAM0
@ ET_PARAM0
Definition SIDefines.h:1035

llvm::AMDGPU::Hwreg::ID_SCHED_MODE
@ ID_SCHED_MODE
Definition SIDefines.h:537

llvm::AMDGPU::IsaInfo::getMaxWavesPerEU
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
Definition AMDGPUBaseInfo.cpp:1253

llvm::AMDGPU::SendMsg::ID_MASK_PreGFX11_
@ ID_MASK_PreGFX11_
Definition SIDefines.h:469

llvm::AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus
@ ID_DEALLOC_VGPRS_GFX11Plus
Definition SIDefines.h:445

llvm::AMDGPU::SendMsg::ID_GS_DONE_PreGFX11
@ ID_GS_DONE_PreGFX11
Definition SIDefines.h:442

llvm::AMDGPU
Definition AMDGPUMetadataVerifier.h:34

llvm::AMDGPU::getMIMGInfo
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)

llvm::AMDGPU::decodeWaitcnt
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
Definition AMDGPUBaseInfo.cpp:1873

llvm::AMDGPU::isDPMACCInstruction
bool isDPMACCInstruction(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:814

llvm::AMDGPU::inst_counter_types
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
Definition AMDGPUBaseInfo.cpp:197

llvm::AMDGPU::Imm
@ Imm
Definition AMDGPURegBankLegalizeRules.h:147

llvm::AMDGPU::getIsaVersion
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Definition TargetParser.cpp:158

llvm::AMDGPU::encodeWaitcnt
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Definition AMDGPUBaseInfo.cpp:1909

llvm::AMDGPU::InstCounterType
InstCounterType
Definition AMDGPUBaseInfo.h:1093

llvm::AMDGPU::NUM_NORMAL_INST_CNTS
@ NUM_NORMAL_INST_CNTS
Definition AMDGPUBaseInfo.h:1098

llvm::AMDGPU::KM_CNT
@ KM_CNT
Definition AMDGPUBaseInfo.h:1101

llvm::AMDGPU::DS_CNT
@ DS_CNT
Definition AMDGPUBaseInfo.h:1095

llvm::AMDGPU::LOAD_CNT
@ LOAD_CNT
Definition AMDGPUBaseInfo.h:1094

llvm::AMDGPU::SAMPLE_CNT
@ SAMPLE_CNT
Definition AMDGPUBaseInfo.h:1099

llvm::AMDGPU::X_CNT
@ X_CNT
Definition AMDGPUBaseInfo.h:1102

llvm::AMDGPU::NUM_INST_CNTS
@ NUM_INST_CNTS
Definition AMDGPUBaseInfo.h:1107

llvm::AMDGPU::BVH_CNT
@ BVH_CNT
Definition AMDGPUBaseInfo.h:1100

llvm::AMDGPU::NUM_EXTENDED_INST_CNTS
@ NUM_EXTENDED_INST_CNTS
Definition AMDGPUBaseInfo.h:1103

llvm::AMDGPU::VA_VDST
@ VA_VDST
Definition AMDGPUBaseInfo.h:1104

llvm::AMDGPU::NUM_EXPERT_INST_CNTS
@ NUM_EXPERT_INST_CNTS
Definition AMDGPUBaseInfo.h:1106

llvm::AMDGPU::VM_VSRC
@ VM_VSRC
Definition AMDGPUBaseInfo.h:1105

llvm::AMDGPU::EXP_CNT
@ EXP_CNT
Definition AMDGPUBaseInfo.h:1096

llvm::AMDGPU::STORE_CNT
@ STORE_CNT
Definition AMDGPUBaseInfo.h:1097

llvm::AMDGPU::decodeStorecntDscnt
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Definition AMDGPUBaseInfo.cpp:1947

llvm::AMDGPU::decodeLoadcntDscnt
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
Definition AMDGPUBaseInfo.cpp:1937

llvm::AMDGPU::encodeStorecntDscnt
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
Definition AMDGPUBaseInfo.cpp:1988

llvm::AMDGPU::getMUBUFIsBufferInv
bool getMUBUFIsBufferInv(unsigned Opc)
Definition AMDGPUBaseInfo.cpp:539

llvm::AMDGPU::getMIMGBaseOpcodeInfo
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)

llvm::AMDGPU::encodeLoadcntDscnt
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
Definition AMDGPUBaseInfo.cpp:1975

llvm::ARM_MB::ST
@ ST
Definition ARMBaseInfo.h:73

llvm::ARM::ProfileKind::M
@ M
Definition ARMTargetParser.h:171

llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:63

llvm::M68k::MemAddrModeKind::u
@ u
Definition M68kBaseInfo.h:60

llvm::RISCVFenceField::W
@ W
Definition RISCVBaseInfo.h:486

llvm::SIInstrFlags::maybeAtomic
@ maybeAtomic
Definition SIDefines.h:122

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::codeview::FrameCookieKind::Copy
@ Copy
Definition CodeView.h:494

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition PointerTypeAnalysis.cpp:211

llvm::lsp::SymbolKind::Event
@ Event
Definition Protocol.h:615

llvm::lsp::MessageType::Info
@ Info
Definition Protocol.h:1288

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::pdb::PDB_LocType::Slot
@ Slot
Definition PDBTypes.h:300

llvm::sandboxir::empty
bool empty() const
Definition BasicBlock.h:101

llvm::sframe::Flags
Flags
Definition SFrame.h:39

llvm::sys::fs::remove
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316

llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition SparseBitVector.h:874

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::seq_inclusive
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739

llvm::PseudoProbeType::Block
@ Block
Definition PseudoProbe.h:30

llvm::print
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
Definition GCNRegPressure.cpp:245

llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition MachineInstrBuilder.h:449

llvm::operator&
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2138

llvm::enum_seq
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337

llvm::Wait
@ Wait
Definition Threading.h:60

llvm::getCPU
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Definition AVRTargetMachine.cpp:32

llvm::operator!=
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2128

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition iterator_range.h:70

llvm::interleaveComma
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634

llvm::MachineFunctionAnalysisManager
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Definition MachineFunctionAnalysisManager.h:24

llvm::equal_to
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173

llvm::operator==
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
Definition AddressRanges.h:151

llvm::getMachineFunctionPassPreservedAnalyses
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
Definition MachinePassManager.cpp:162

llvm::SIInsertWaitcntsID
char & SIInsertWaitcntsID
Definition SIInsertWaitcnts.cpp:1689

llvm::UWTableKind::Async
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636

llvm::None
@ None
Definition CodeGenData.h:107

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::report_fatal_error
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163

llvm::Count
FunctionAddr VTableAddr Count
Definition InstrProf.h:139

llvm::CodeGenOptLevel
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1131

llvm::errs
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition raw_ostream.cpp:904

llvm::iterator_range
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >

llvm::IRMemLocation::Other
@ Other
Any other memory.
Definition ModRef.h:68

llvm::operator&=
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
Definition SparseBitVector.h:832

llvm::MCPhysReg
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:23

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::operator|=
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
Definition SparseBitVector.h:820

llvm::operator|
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2158

llvm::createSIInsertWaitcntsPass
FunctionPass * createSIInsertWaitcntsPass()
Definition SIInsertWaitcnts.cpp:1691

llvm::AliasAnalysis
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Definition AliasAnalysis.h:721

llvm::reportFatalUsageError
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177

N
#define N

llvm::AMDGPU::EncodingField< 10, 6 >::Default
static constexpr ValueType Default
Definition AMDGPUBaseInfo.h:402

llvm::AMDGPU::EncodingFields< HwregId, HwregOffset, HwregSize >::encode
static constexpr uint64_t encode(Fields... Values)
Definition AMDGPUBaseInfo.h:417

llvm::AMDGPU::HardwareLimits
Represents the hardware counter limits for different wait count types.
Definition AMDGPUBaseInfo.h:1244

llvm::AMDGPU::HardwareLimits::ExpcntMax
unsigned ExpcntMax
Definition AMDGPUBaseInfo.h:1246

llvm::AMDGPU::HardwareLimits::VmVsrcMax
unsigned VmVsrcMax
Definition AMDGPUBaseInfo.h:1254

llvm::AMDGPU::HardwareLimits::SamplecntMax
unsigned SamplecntMax
Definition AMDGPUBaseInfo.h:1249

llvm::AMDGPU::HardwareLimits::BvhcntMax
unsigned BvhcntMax
Definition AMDGPUBaseInfo.h:1250

llvm::AMDGPU::HardwareLimits::LoadcntMax
unsigned LoadcntMax
Definition AMDGPUBaseInfo.h:1245

llvm::AMDGPU::HardwareLimits::XcntMax
unsigned XcntMax
Definition AMDGPUBaseInfo.h:1252

llvm::AMDGPU::HardwareLimits::KmcntMax
unsigned KmcntMax
Definition AMDGPUBaseInfo.h:1251

llvm::AMDGPU::HardwareLimits::DscntMax
unsigned DscntMax
Definition AMDGPUBaseInfo.h:1247

llvm::AMDGPU::HardwareLimits::VaVdstMax
unsigned VaVdstMax
Definition AMDGPUBaseInfo.h:1253

llvm::AMDGPU::HardwareLimits::StorecntMax
unsigned StorecntMax
Definition AMDGPUBaseInfo.h:1248

llvm::AMDGPU::IsaVersion
Instruction set architecture version.
Definition TargetParser.h:51

llvm::AMDGPU::MIMGBaseOpcodeInfo
Definition AMDGPUBaseInfo.h:435

llvm::AMDGPU::MIMGBaseOpcodeInfo::Sampler
bool Sampler
Definition AMDGPUBaseInfo.h:440

llvm::AMDGPU::MIMGBaseOpcodeInfo::MSAA
bool MSAA
Definition AMDGPUBaseInfo.h:449

llvm::AMDGPU::MIMGBaseOpcodeInfo::PointSampleAccel
bool PointSampleAccel
Definition AMDGPUBaseInfo.h:453

llvm::AMDGPU::MIMGBaseOpcodeInfo::BVH
bool BVH
Definition AMDGPUBaseInfo.h:450

llvm::AMDGPU::MIMGInfo
Definition AMDGPUBaseInfo.h:538

llvm::cl::desc
Definition CommandLine.h:410

llvm::enum_iteration_traits< WaitEventType >::is_iterable
static constexpr bool is_iterable
Definition SIInsertWaitcnts.cpp:176

llvm::enum_iteration_traits
Definition Sequence.h:100