//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
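/// As an illustrative sketch (hypothetical pixel shader, not taken from an
/// actual test case), a block containing
///
///   %val = IMAGE_SAMPLE ...      ; needs WQM to compute derivatives
///   BUFFER_STORE_DWORD %val ...  ; must run Exact to avoid phantom stores
///
/// ends up with EXEC switched into whole quad mode before the sample and
/// restricted back to the live mask around the store, using the instruction
/// sequences shown above.
///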
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States this instruction must not execute in.
  char OutNeeds = 0; // States required after this instruction.
};

struct BlockInfo {
  char Needs = 0;    // States needed somewhere within this block.
  char InNeeds = 0;  // States required on entry to this block.
  char OutNeeds = 0; // States required on exit from this block.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TRI->isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TRI->isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

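// Save SCC across an insertion point: copy it into a fresh SGPR before
// \p Before and copy it back immediately afterwards. The returned iterator
// points at the restoring copy, so instructions inserted there land between
// the save and the restore, where they may clobber SCC freely.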
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
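//
// The search walks SCC's live segments outward from the preferred end of the
// range: while the candidate index lies inside a segment where SCC is live,
// step past that segment and retry, giving up once the candidate would leave
// [First, Last]. If no SCC-free point exists in the range, SCC is explicitly
// saved and restored at the chosen point instead.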
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

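    // If this instruction restricted the set of allowed states, later state
    // switches cannot be hoisted above it: a pinned-down state invalidates
    // both earliest-insertion points, while merely disallowing WWM only
    // invalidates the WWM one.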
    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
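    // Strip all explicit operands beyond the destination and the first
    // source, so the instruction has the two-operand shape of a copy before
    // its opcode is rewritten below.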
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);

    const unsigned Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          TRI->isVirtualRegister(Reg)
              ? MRI->getRegClass(Reg)
              : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
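  // Shaders that never need WQM can lower their live mask queries straight to
  // EXEC; if they do not use WWM either, we are done. Shaders that are WQM
  // throughout only need WQM enabled once in the entry block. Everything else
  // falls through to the general per-block processing below.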
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}