LLVM 23.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
75#include "llvm/MC/LaneBitmask.h"
76#include "llvm/MC/MCInstrDesc.h"
78#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
83#include <algorithm>
84#include <cassert>
85#include <climits>
86#include <cstdint>
87#include <deque>
88#include <functional>
89#include <iomanip>
90#include <iterator>
91#include <map>
92#include <memory>
93#include <sstream>
94#include <tuple>
95#include <utility>
96#include <vector>
97
98using namespace llvm;
99
100#define DEBUG_TYPE "pipeliner"
101
102STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
103STATISTIC(NumPipelined, "Number of loops software pipelined");
104STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
105STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
106STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
107STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
108STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
109STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
110STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
111STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
112STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197/// A command line argument to limit the number of store instructions in the
198/// target basic block.
200 "pipeliner-max-num-stores",
201 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
202 cl::init(200));
203
204// A command line option to enable the CopyToPhi DAG mutation.
206 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
207 cl::init(true),
208 cl::desc("Enable CopyToPhi DAG Mutation"));
209
210/// A command line argument to force pipeliner to use specified issue
211/// width.
213 "pipeliner-force-issue-width",
214 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
215 cl::init(-1));
216
217/// A command line argument to set the window scheduling option.
220 cl::desc("Set how to use window scheduling algorithm."),
222 "Turn off window algorithm."),
224 "Use window algorithm after SMS algorithm fails."),
226 "Use window algorithm instead of SMS algorithm.")));
227
228unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
229char MachinePipeliner::ID = 0;
230#ifndef NDEBUG
232#endif
234
236 "Modulo Software Pipelining", false, false)
242 "Modulo Software Pipelining", false, false)
243
244namespace {
245
246/// This class holds an SUnit corresponding to a memory operation and other
247/// information related to the instruction.
251
252 /// The value of a memory operand.
253 const Value *MemOpValue = nullptr;
254
255 /// The offset of a memory operand.
256 int64_t MemOpOffset = 0;
257
259
260 /// True if all the underlying objects are identified.
261 bool IsAllIdentified = false;
262
264
265 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
266
267 bool isUnknown() const { return MemOpValue == nullptr; }
268
269private:
271};
272
273/// Add loop-carried chain dependencies. This class handles the same type of
274/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
275/// account dependencies across iterations.
277 // Type of instruction that is relevant to order-dependencies
278 enum class InstrTag {
279 Barrier = 0, ///< A barrier event instruction.
280 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
281 ///< not a barrier event.
282 FPExceptions = 2, ///< An instruction that does not match above, but may
283 ///< raise floatin-point exceptions.
284 };
285
286 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
287 TaggedSUnit(SUnit *SU, InstrTag Tag)
288 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
289
290 InstrTag getTag() const { return InstrTag(getInt()); }
291 };
292
293 /// Holds loads and stores with memory related information.
294 struct LoadStoreChunk {
297
298 void append(SUnit *SU);
299 };
300
302 BatchAAResults *BAA;
303 std::vector<SUnit> &SUnits;
304
305 /// The size of SUnits, for convenience.
306 const unsigned N;
307
308 /// Loop-carried Edges.
309 std::vector<BitVector> LoopCarried;
310
311 /// Instructions related to chain dependencies. They are one of the
312 /// following:
313 ///
314 /// 1. Barrier event.
315 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
316 /// value.
317 /// 3. Store, but not a barrier event.
318 /// 4. None of them, but may raise floating-point exceptions.
319 ///
320 /// This is used when analyzing loop-carried dependencies that access global
321 /// barrier instructions.
322 std::vector<TaggedSUnit> TaggedSUnits;
323
324 const TargetInstrInfo *TII = nullptr;
325 const TargetRegisterInfo *TRI = nullptr;
326
327public:
329 const TargetInstrInfo *TII,
330 const TargetRegisterInfo *TRI);
331
332 /// The main function to compute loop-carried order-dependencies.
333 void computeDependencies();
334
335 const BitVector &getLoopCarried(unsigned Idx) const {
336 return LoopCarried[Idx];
337 }
338
339private:
340 /// Tags to \p SU if the instruction may affect the order-dependencies.
341 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
342
343 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
344 const LoadStoreChunk &To);
345
346 /// Add a loop-carried order dependency between \p Src and \p Dst if we
347 /// cannot prove they are independent.
348 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
349 const SUnitWithMemInfo &Dst);
350
351 void computeDependenciesAux();
352};
353
354} // end anonymous namespace
355
356/// The "main" function for implementing Swing Modulo Scheduling.
358 if (skipFunction(mf.getFunction()))
359 return false;
360
361 if (!EnableSWP)
362 return false;
363
364 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
365 !EnableSWPOptSize.getPosition())
366 return false;
367
369 return false;
370
371 // Cannot pipeline loops without instruction itineraries if we are using
372 // DFA for the pipeliner.
373 if (mf.getSubtarget().useDFAforSMS() &&
376 return false;
377
378 MF = &mf;
382 TII = MF->getSubtarget().getInstrInfo();
383 RegClassInfo.runOnMachineFunction(*MF);
384
385 for (const auto &L : *MLI)
386 scheduleLoop(*L);
387
388 return false;
389}
390
391/// Attempt to perform the SMS algorithm on the specified loop. This function is
392/// the main entry point for the algorithm. The function identifies candidate
393/// loops, calculates the minimum initiation interval, and attempts to schedule
394/// the loop.
395bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
396 bool Changed = false;
397 for (const auto &InnerLoop : L)
398 Changed |= scheduleLoop(*InnerLoop);
399
400#ifndef NDEBUG
401 // Stop trying after reaching the limit (if any).
402 int Limit = SwpLoopLimit;
403 if (Limit >= 0) {
404 if (NumTries >= SwpLoopLimit)
405 return Changed;
406 NumTries++;
407 }
408#endif
409
410 setPragmaPipelineOptions(L);
411 if (!canPipelineLoop(L)) {
412 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
413 ORE->emit([&]() {
414 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
415 L.getStartLoc(), L.getHeader())
416 << "Failed to pipeline loop";
417 });
418
419 LI.LoopPipelinerInfo.reset();
420 return Changed;
421 }
422
423 ++NumTrytoPipeline;
424 if (useSwingModuloScheduler())
425 Changed = swingModuloScheduler(L);
426
427 if (useWindowScheduler(Changed))
428 Changed = runWindowScheduler(L);
429
430 LI.LoopPipelinerInfo.reset();
431 return Changed;
432}
433
/// Read the loop-pipelining pragmas attached to the loop's IR header block
/// and record them in the pass state (disabledByPragma, II_setByPragma).
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  // The pragma metadata lives on the IR terminator of the corresponding
  // BasicBlock; bail out if any link in that chain is missing.
  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  // Operand 0 is the self-referencing loop id; scan the remaining operands
  // for the pipelining hints.
  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two operands.");
      // NOTE(review): the assignment target of this expression (presumably
      // `II_setByPragma =`) appears to have been lost in extraction — the
      // following assert reads II_setByPragma, so verify against upstream.
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
481
482/// Depth-first search to detect cycles among PHI dependencies.
483/// Returns true if a cycle is detected within the PHI-only subgraph.
484static bool hasPHICycleDFS(
485 unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
486 SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {
487
488 // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
489 auto It = PhiDeps.find(Reg);
490 if (It == PhiDeps.end())
491 return false;
492
493 if (RecStack.count(Reg))
494 return true; // backedge.
495 if (Visited.count(Reg))
496 return false;
497
498 Visited.insert(Reg);
499 RecStack.insert(Reg);
500
501 for (unsigned Dep : It->second) {
502 if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
503 return true;
504 }
505
506 RecStack.erase(Reg);
507 return false;
508}
509
/// Return true if the PHI nodes of \p LoopHeader form a dependence cycle
/// among themselves (a PHI whose operands lead back to itself through other
/// PHI definitions). Non-PHI definitions terminate the search.
/// NOTE(review): the \p MRI parameter is not referenced in the visible body.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
                        const MachineRegisterInfo &MRI) {
  // NOTE(review): the declaration of `PhiDeps` appears to have been lost in
  // extraction — presumably `DenseMap<unsigned, SmallVector<unsigned, 2>>`,
  // matching the signature of hasPHICycleDFS; verify against upstream.

  // Collect PHI nodes and their dependencies.
  for (const MachineInstr &MI : LoopHeader->phis()) {
    unsigned DefReg = MI.getOperand(0).getReg();
    auto Ins = PhiDeps.try_emplace(DefReg).first;

    // PHI operands are (Reg, MBB) pairs starting at index 1.
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
      Ins->second.push_back(MI.getOperand(I).getReg());
  }

  // DFS to detect cycles among PHI nodes.
  SmallSet<unsigned, 8> Visited, RecStack;

  // Start DFS from each PHI-def.
  for (const auto &KV : PhiDeps) {
    unsigned Reg = KV.first;
    if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
      return true;
  }

  return false;
}
536
537/// Return true if the loop can be software pipelined. The algorithm is
538/// restricted to loops with a single basic block. Make sure that the
539/// branch in the loop can be analyzed.
540bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
541 if (L.getNumBlocks() != 1) {
542 ORE->emit([&]() {
543 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
544 L.getStartLoc(), L.getHeader())
545 << "Not a single basic block: "
546 << ore::NV("NumBlocks", L.getNumBlocks());
547 });
548 return false;
549 }
550
551 if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
552 LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
553 return false;
554 }
555
556 if (disabledByPragma) {
557 ORE->emit([&]() {
558 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
559 L.getStartLoc(), L.getHeader())
560 << "Disabled by Pragma.";
561 });
562 return false;
563 }
564
565 // Check if the branch can't be understood because we can't do pipelining
566 // if that's the case.
567 LI.TBB = nullptr;
568 LI.FBB = nullptr;
569 LI.BrCond.clear();
570 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
571 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
572 NumFailBranch++;
573 ORE->emit([&]() {
574 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
575 L.getStartLoc(), L.getHeader())
576 << "The branch can't be understood";
577 });
578 return false;
579 }
580
581 LI.LoopInductionVar = nullptr;
582 LI.LoopCompare = nullptr;
583 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
584 if (!LI.LoopPipelinerInfo) {
585 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
586 NumFailLoop++;
587 ORE->emit([&]() {
588 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
589 L.getStartLoc(), L.getHeader())
590 << "The loop structure is not supported";
591 });
592 return false;
593 }
594
595 if (!L.getLoopPreheader()) {
596 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
597 NumFailPreheader++;
598 ORE->emit([&]() {
599 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
600 L.getStartLoc(), L.getHeader())
601 << "No loop preheader found";
602 });
603 return false;
604 }
605
606 unsigned NumStores = 0;
607 for (MachineInstr &MI : *L.getHeader())
608 if (MI.mayStore())
609 ++NumStores;
610 if (NumStores > SwpMaxNumStores) {
611 LLVM_DEBUG(dbgs() << "Too many stores\n");
612 NumFailTooManyStores++;
613 ORE->emit([&]() {
614 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
615 L.getStartLoc(), L.getHeader())
616 << "Too many store instructions in the loop: "
617 << ore::NV("NumStores", NumStores) << " > "
618 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
619 });
620 return false;
621 }
622
623 // Remove any subregisters from inputs to phi nodes.
624 preprocessPhiNodes(*L.getHeader());
625 return true;
626}
627
/// Rewrite PHI inputs that carry a subregister index: each such input is
/// replaced by a fresh full register defined by a COPY inserted in the
/// corresponding predecessor block, keeping SlotIndexes up to date.
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    // PHI results never carry a subregister index in SSA form.
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    // PHI operands are (Reg, MBB) pairs starting at index 1.
    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
      // NOTE(review): the declaration of the insertion point `At` appears to
      // have been lost in extraction — presumably
      // `MachineBasicBlock::iterator At = PredB.getFirstTerminator();`;
      // verify against upstream.
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                    .addReg(RegOp.getReg(), getRegState(RegOp),
                            RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}
658
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to Schedule the loop.
/// Returns true if a new (pipelined) schedule was produced.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  // NOTE(review): the first argument line of this constructor call appears
  // to have been lost in extraction; verify the full argument list against
  // upstream.
  SwingSchedulerDAG SMS(
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  // NOTE(review): the `for` header of this terminator-counting loop appears
  // to have been lost in extraction; the visible part decrements `size`
  // once per terminator instruction. Verify against upstream.
       E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  // Schedule only the region up to the first terminator.
  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}
691
702
703bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
704 MachineSchedContext Context;
705 Context.MF = MF;
706 Context.MLI = MLI;
707 Context.MDT = MDT;
708 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
709 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
710 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
711 Context.RegClassInfo->runOnMachineFunction(*MF);
712 WindowScheduler WS(&Context, L);
713 return WS.run();
714}
715
/// Whether the SMS algorithm should be attempted for the current loop.
bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  // NOTE(review): the return statement (presumably comparing the window
  // scheduling option against its "force" mode) appears to have been lost
  // in extraction; verify against upstream.
}
720
/// Whether the window scheduler should run, given that SMS already
/// \p Changed (or not) the loop.
bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work for following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler is successfully scheduled.
  // 3. when pragma II is enabled.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  // NOTE(review): the final return statement (presumably combining the
  // window-scheduling option with !Changed) appears to have been lost in
  // extraction; verify against upstream.
}
735
736void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
737 if (SwpForceII > 0)
738 MII = SwpForceII;
739 else if (II_setByPragma > 0)
740 MII = II_setByPragma;
741 else
742 MII = std::max(ResMII, RecMII);
743}
744
745void SwingSchedulerDAG::setMAX_II() {
746 if (SwpForceII > 0)
747 MAX_II = SwpForceII;
748 else if (II_setByPragma > 0)
749 MAX_II = II_setByPragma;
750 else
751 MAX_II = MII + SwpIISearchRange;
752}
753
754/// We override the schedule function in ScheduleDAGInstrs to implement the
755/// scheduling part of the Swing Modulo Scheduling algorithm.
757 buildSchedGraph(AA);
758 const LoopCarriedEdges LCE = addLoopCarriedDependences();
759 updatePhiDependences();
760 Topo.InitDAGTopologicalSorting();
761 changeDependences();
762 postProcessDAG();
763 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
764 LLVM_DEBUG({
765 dump();
766 dbgs() << "===== Loop Carried Edges Begin =====\n";
767 for (SUnit &SU : SUnits)
768 LCE.dump(&SU, TRI, &MRI);
769 dbgs() << "===== Loop Carried Edges End =====\n";
770 });
771
772 NodeSetType NodeSets;
773 findCircuits(NodeSets);
774 NodeSetType Circuits = NodeSets;
775
776 // Calculate the MII.
777 unsigned ResMII = calculateResMII();
778 unsigned RecMII = calculateRecMII(NodeSets);
779
780 fuseRecs(NodeSets);
781
782 // This flag is used for testing and can cause correctness problems.
783 if (SwpIgnoreRecMII)
784 RecMII = 0;
785
786 setMII(ResMII, RecMII);
787 setMAX_II();
788
789 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
790 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
791
792 // Can't schedule a loop without a valid MII.
793 if (MII == 0) {
794 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
795 NumFailZeroMII++;
796 Pass.ORE->emit([&]() {
798 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
799 << "Invalid Minimal Initiation Interval: 0";
800 });
801 return;
802 }
803
804 // Don't pipeline large loops.
805 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
806 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
807 << ", we don't pipeline large loops\n");
808 NumFailLargeMaxMII++;
809 Pass.ORE->emit([&]() {
811 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
812 << "Minimal Initiation Interval too large: "
813 << ore::NV("MII", (int)MII) << " > "
814 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
815 << "Refer to -pipeliner-max-mii.";
816 });
817 return;
818 }
819
820 computeNodeFunctions(NodeSets);
821
822 registerPressureFilter(NodeSets);
823
824 colocateNodeSets(NodeSets);
825
826 checkNodeSets(NodeSets);
827
828 LLVM_DEBUG({
829 for (auto &I : NodeSets) {
830 dbgs() << " Rec NodeSet ";
831 I.dump();
832 }
833 });
834
835 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
836
837 groupRemainingNodes(NodeSets);
838
839 removeDuplicateNodes(NodeSets);
840
841 LLVM_DEBUG({
842 for (auto &I : NodeSets) {
843 dbgs() << " NodeSet ";
844 I.dump();
845 }
846 });
847
848 computeNodeOrder(NodeSets);
849
850 // check for node order issues
851 checkValidNodeOrder(Circuits);
852
853 SMSchedule Schedule(Pass.MF, this);
854 Scheduled = schedulePipeline(Schedule);
855
856 if (!Scheduled){
857 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
858 NumFailNoSchedule++;
859 Pass.ORE->emit([&]() {
861 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
862 << "Unable to find schedule";
863 });
864 return;
865 }
866
867 unsigned numStages = Schedule.getMaxStageCount();
868 // No need to generate pipeline if there are no overlapped iterations.
869 if (numStages == 0) {
870 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
871 NumFailZeroStage++;
872 Pass.ORE->emit([&]() {
874 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
875 << "No need to pipeline - no overlapped iterations in schedule.";
876 });
877 return;
878 }
879 // Check that the maximum stage count is less than user-defined limit.
880 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
881 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
882 << " : too many stages, abort\n");
883 NumFailLargeMaxStage++;
884 Pass.ORE->emit([&]() {
886 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
887 << "Too many stages in schedule: "
888 << ore::NV("numStages", (int)numStages) << " > "
889 << ore::NV("SwpMaxStages", SwpMaxStages)
890 << ". Refer to -pipeliner-max-stages.";
891 });
892 return;
893 }
894
895 Pass.ORE->emit([&]() {
896 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
897 Loop.getHeader())
898 << "Pipelined succesfully!";
899 });
900
901 // Generate the schedule as a ModuloSchedule.
902 DenseMap<MachineInstr *, int> Cycles, Stages;
903 std::vector<MachineInstr *> OrderedInsts;
904 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
905 ++Cycle) {
906 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
907 OrderedInsts.push_back(SU->getInstr());
908 Cycles[SU->getInstr()] = Cycle;
909 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
910 }
911 }
913 for (auto &KV : NewMIs) {
914 Cycles[KV.first] = Cycles[KV.second];
915 Stages[KV.first] = Stages[KV.second];
916 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
917 }
918
919 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
920 std::move(Stages));
922 assert(NewInstrChanges.empty() &&
923 "Cannot serialize a schedule with InstrChanges!");
925 MSTI.annotate();
926 return;
927 }
928 // The experimental code generator can't work if there are InstChanges.
929 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
930 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
931 MSE.expand();
932 } else if (MVECodeGen && NewInstrChanges.empty() &&
933 LoopPipelinerInfo->isMVEExpanderSupported() &&
935 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
936 MSE.expand();
937 } else {
938 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
939 MSE.expand();
940 MSE.cleanup();
941 }
942 ++NumPipelined;
943}
944
945/// Clean up after the software pipeliner runs.
947 for (auto &KV : NewMIs)
948 MF.deleteMachineInstr(KV.second);
949 NewMIs.clear();
950
951 // Call the superclass.
953}
954
955/// Return the register values for the operands of a Phi instruction.
956/// This function assume the instruction is a Phi.
958 Register &InitVal, Register &LoopVal) {
959 assert(Phi.isPHI() && "Expecting a Phi.");
960
961 InitVal = Register();
962 LoopVal = Register();
963 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
964 if (Phi.getOperand(i + 1).getMBB() != Loop)
965 InitVal = Phi.getOperand(i).getReg();
966 else
967 LoopVal = Phi.getOperand(i).getReg();
968
969 assert(InitVal && LoopVal && "Unexpected Phi structure.");
970}
971
972/// Return the Phi register value that comes the loop block.
974 const MachineBasicBlock *LoopBB) {
975 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
976 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
977 return Phi.getOperand(i).getReg();
978 return Register();
979}
980
/// Return true if SUb can be reached from SUa following the chain edges
/// (i.e. only SDep::Order successors are traversed).
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  // NOTE(review): the declarations of the DFS worklist and visited set
  // appear to have been lost in extraction — presumably a SmallVector
  // `Worklist` and a SmallPtrSet `Visited`; verify against upstream.
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      // Only follow order (chain) edges; other dependence kinds are ignored.
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}
1002
1004 if (!getUnderlyingObjects())
1005 return;
1006 for (const Value *Obj : UnderlyingObjs)
1007 if (!isIdentifiedObject(Obj)) {
1008 IsAllIdentified = false;
1009 break;
1010 }
1011}
1012
1014 const SUnitWithMemInfo &Other) const {
1015 // If all underlying objects are identified objects and there is no overlap
1016 // between them, then these two instructions are disjoint.
1017 if (!IsAllIdentified || !Other.IsAllIdentified)
1018 return false;
1019 for (const Value *Obj : UnderlyingObjs)
1020 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
1021 return false;
1022 return true;
1023}
1024
/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  // Only instructions with exactly one memory operand backed by an IR value
  // are analyzable here.
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  // NOTE(review): the call that populates `UnderlyingObjs` (presumably
  // llvm::getUnderlyingObjects on MemOpValue) appears to have been lost in
  // extraction; verify against upstream.

  // TODO: A no alias scope may be valid only in a single iteration. In this
  // case we need to peel off it like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}
1045
1046/// Returns true if there is a loop-carried order dependency from \p Src to \p
1047/// Dst.
1048static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
1049 const SUnitWithMemInfo &Dst,
1050 BatchAAResults &BAA,
1051 const TargetInstrInfo *TII,
1052 const TargetRegisterInfo *TRI,
1053 const SwingSchedulerDAG *SSD) {
1054 if (Src.isTriviallyDisjoint(Dst))
1055 return false;
1056 if (isSuccOrder(Src.SU, Dst.SU))
1057 return false;
1058
1059 MachineInstr &SrcMI = *Src.SU->getInstr();
1060 MachineInstr &DstMI = *Dst.SU->getInstr();
1061
1062 if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
1063 return false;
1064
1065 // Second, the more expensive check that uses alias analysis on the
1066 // base registers. If they alias, and the load offset is less than
1067 // the store offset, the mark the dependence as loop carried.
1068 if (Src.isUnknown() || Dst.isUnknown())
1069 return true;
1070 if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
1071 return true;
1072
1073 if (BAA.isNoAlias(
1074 MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
1075 MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
1076 return false;
1077
1078 // AliasAnalysis sometimes gives up on following the underlying
1079 // object. In such a case, separate checks for underlying objects may
1080 // prove that there are no aliases between two accesses.
1081 for (const Value *SrcObj : Src.UnderlyingObjs)
1082 for (const Value *DstObj : Dst.UnderlyingObjs)
1083 if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
1084 MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
1085 return true;
1086
1087 return false;
1088}
1089
1090void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1091 const MachineInstr *MI = SU->getInstr();
1092 if (!MI->mayLoadOrStore())
1093 return;
1094 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1095}
1096
1098 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1099 const TargetRegisterInfo *TRI)
1100 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1101 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1102
1104 // Traverse all instructions and extract only what we are targetting.
1105 for (auto &SU : SUnits) {
1106 auto Tagged = getInstrTag(&SU);
1107
1108 // This instruction has no loop-carried order-dependencies.
1109 if (!Tagged)
1110 continue;
1111 TaggedSUnits.emplace_back(&SU, *Tagged);
1112 }
1113
1114 computeDependenciesAux();
1115}
1116
1117std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1118LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1119 MachineInstr *MI = SU->getInstr();
1120 if (TII->isGlobalMemoryObject(MI))
1121 return InstrTag::Barrier;
1122
1123 if (MI->mayStore() ||
1124 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1125 return InstrTag::LoadOrStore;
1126
1127 if (MI->mayRaiseFPException())
1128 return InstrTag::FPExceptions;
1129
1130 return std::nullopt;
1131}
1132
1133void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1134 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst) {
1135 // Avoid self-dependencies.
1136 if (Src.SU == Dst.SU)
1137 return;
1138
1139 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG))
1140 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1141}
1142
1143void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1144 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1145 // Add load-to-store dependencies (WAR).
1146 for (const SUnitWithMemInfo &Src : From.Loads)
1147 for (const SUnitWithMemInfo &Dst : To.Stores)
1148 addDependenciesBetweenSUs(Src, Dst);
1149
1150 // Add store-to-load dependencies (RAW).
1151 for (const SUnitWithMemInfo &Src : From.Stores)
1152 for (const SUnitWithMemInfo &Dst : To.Loads)
1153 addDependenciesBetweenSUs(Src, Dst);
1154
1155 // Add store-to-store dependencies (WAW).
1156 for (const SUnitWithMemInfo &Src : From.Stores)
1157 for (const SUnitWithMemInfo &Dst : To.Stores)
1158 addDependenciesBetweenSUs(Src, Dst);
1159}
1160
1161void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
1163 for (const auto &TSU : TaggedSUnits) {
1164 InstrTag Tag = TSU.getTag();
1165 SUnit *SU = TSU.getPointer();
1166 switch (Tag) {
1167 case InstrTag::Barrier:
1168 Chunks.emplace_back();
1169 break;
1170 case InstrTag::LoadOrStore:
1171 Chunks.back().append(SU);
1172 break;
1173 case InstrTag::FPExceptions:
1174 // TODO: Handle this properly.
1175 break;
1176 }
1177 }
1178
1179 // Add dependencies between memory operations. If there are one or more
1180 // barrier events between two memory instructions, we don't add a
1181 // loop-carried dependence for them.
1182 for (const LoadStoreChunk &Chunk : Chunks)
1183 addLoopCarriedDepenenciesForChunks(Chunk, Chunk);
1184
1185 // TODO: If there are multiple barrier instructions, dependencies from the
1186 // last barrier instruction (or load/store below it) to the first barrier
1187 // instruction (or load/store above it).
1188}
1189
1190/// Add a chain edge between a load and store if the store can be an
1191/// alias of the load on a subsequent iteration, i.e., a loop carried
1192/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1193/// but that code doesn't create loop carried dependences.
1194/// TODO: Also compute output-dependencies.
1195LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1196 LoopCarriedEdges LCE;
1197
1198 // Add loop-carried order-dependencies
1199 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1200 LCODTracker.computeDependencies();
1201 for (unsigned I = 0; I != SUnits.size(); I++)
1202 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1203 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1204
1205 LCE.modifySUnits(SUnits, TII);
1206 return LCE;
1207}
1208
1209/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1210/// processes dependences for PHIs. This function adds true dependences
1211/// from a PHI to a use, and a loop carried dependence from the use to the
1212/// PHI. The loop carried dependence is represented as an anti dependence
1213/// edge. This function also removes chain dependences between unrelated
1214/// PHIs.
1215void SwingSchedulerDAG::updatePhiDependences() {
1216 SmallVector<SDep, 4> RemoveDeps;
1217 const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1218
1219 // Iterate over each DAG node.
1220 for (SUnit &I : SUnits) {
1221 RemoveDeps.clear();
1222 // Set to true if the instruction has an operand defined by a Phi.
1223 Register HasPhiUse;
1224 Register HasPhiDef;
1225 MachineInstr *MI = I.getInstr();
1226 // Iterate over each operand, and we process the definitions.
1227 for (const MachineOperand &MO : MI->operands()) {
1228 if (!MO.isReg())
1229 continue;
1230 Register Reg = MO.getReg();
1231 if (MO.isDef()) {
1232 // If the register is used by a Phi, then create an anti dependence.
1234 UI = MRI.use_instr_begin(Reg),
1235 UE = MRI.use_instr_end();
1236 UI != UE; ++UI) {
1237 MachineInstr *UseMI = &*UI;
1238 SUnit *SU = getSUnit(UseMI);
1239 if (SU != nullptr && UseMI->isPHI()) {
1240 if (!MI->isPHI()) {
1241 SDep Dep(SU, SDep::Anti, Reg);
1242 Dep.setLatency(1);
1243 I.addPred(Dep);
1244 } else {
1245 HasPhiDef = Reg;
1246 // Add a chain edge to a dependent Phi that isn't an existing
1247 // predecessor.
1248
1249 // %3:intregs = PHI %21:intregs, %bb.6, %7:intregs, %bb.1 - SU0
1250 // %7:intregs = PHI %21:intregs, %bb.6, %13:intregs, %bb.1 - SU1
1251 // %27:intregs = A2_zxtb %3:intregs - SU2
1252 // %13:intregs = C2_muxri %45:predregs, 0, %46:intreg
1253 // If we have dependent phis, SU0 should be the successor of SU1
1254 // not the other way around. (it used to be SU1 is the successor
1255 // of SU0). In some cases, SU0 is scheduled earlier than SU1
1256 // resulting in bad IR as we do not have a value that can be used
1257 // by SU2.
1258
1259 if (SU->NodeNum < I.NodeNum && !SU->isPred(&I))
1260 SU->addPred(SDep(&I, SDep::Barrier));
1261 }
1262 }
1263 }
1264 } else if (MO.isUse()) {
1265 // If the register is defined by a Phi, then create a true dependence.
1266 MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1267 if (DefMI == nullptr)
1268 continue;
1269 SUnit *SU = getSUnit(DefMI);
1270 if (SU != nullptr && DefMI->isPHI()) {
1271 if (!MI->isPHI()) {
1272 SDep Dep(SU, SDep::Data, Reg);
1273 Dep.setLatency(0);
1274 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
1275 &SchedModel);
1276 I.addPred(Dep);
1277 } else {
1278 HasPhiUse = Reg;
1279 // Add a chain edge to a dependent Phi that isn't an existing
1280 // predecessor.
1281 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1282 I.addPred(SDep(SU, SDep::Barrier));
1283 }
1284 }
1285 }
1286 }
1287 // Remove order dependences from an unrelated Phi.
1288 if (!SwpPruneDeps)
1289 continue;
1290 for (auto &PI : I.Preds) {
1291 MachineInstr *PMI = PI.getSUnit()->getInstr();
1292 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1293 if (I.getInstr()->isPHI()) {
1294 if (PMI->getOperand(0).getReg() == HasPhiUse)
1295 continue;
1296 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1297 continue;
1298 }
1299 RemoveDeps.push_back(PI);
1300 }
1301 }
1302 for (const SDep &D : RemoveDeps)
1303 I.removePred(D);
1304 }
1305}
1306
1307/// Iterate over each DAG node and see if we can change any dependences
1308/// in order to reduce the recurrence MII.
1309void SwingSchedulerDAG::changeDependences() {
1310 // See if an instruction can use a value from the previous iteration.
1311 // If so, we update the base and offset of the instruction and change
1312 // the dependences.
1313 for (SUnit &I : SUnits) {
1314 unsigned BasePos = 0, OffsetPos = 0;
1315 Register NewBase;
1316 int64_t NewOffset = 0;
1317 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1318 NewOffset))
1319 continue;
1320
1321 // Get the MI and SUnit for the instruction that defines the original base.
1322 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1323 MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1324 if (!DefMI)
1325 continue;
1326 SUnit *DefSU = getSUnit(DefMI);
1327 if (!DefSU)
1328 continue;
1329 // Get the MI and SUnit for the instruction that defins the new base.
1330 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1331 if (!LastMI)
1332 continue;
1333 SUnit *LastSU = getSUnit(LastMI);
1334 if (!LastSU)
1335 continue;
1336
1337 if (Topo.IsReachable(&I, LastSU))
1338 continue;
1339
1340 // Remove the dependence. The value now depends on a prior iteration.
1342 for (const SDep &P : I.Preds)
1343 if (P.getSUnit() == DefSU)
1344 Deps.push_back(P);
1345 for (const SDep &D : Deps) {
1346 Topo.RemovePred(&I, D.getSUnit());
1347 I.removePred(D);
1348 }
1349 // Remove the chain dependence between the instructions.
1350 Deps.clear();
1351 for (auto &P : LastSU->Preds)
1352 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1353 Deps.push_back(P);
1354 for (const SDep &D : Deps) {
1355 Topo.RemovePred(LastSU, D.getSUnit());
1356 LastSU->removePred(D);
1357 }
1358
1359 // Add a dependence between the new instruction and the instruction
1360 // that defines the new base.
1361 SDep Dep(&I, SDep::Anti, NewBase);
1362 Topo.AddPred(LastSU, &I);
1363 LastSU->addPred(Dep);
1364
1365 // Remember the base and offset information so that we can update the
1366 // instruction during code generation.
1367 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1368 }
1369}
1370
1371/// Create an instruction stream that represents a single iteration and stage of
1372/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1373/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1374/// function is an approximation of SMSchedule::finalizeSchedule with all
1375/// non-const operations removed.
1377 SMSchedule &Schedule,
1378 std::vector<MachineInstr *> &OrderedInsts,
1381
1382 // Move all instructions to the first stage from the later stages.
1383 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1384 ++Cycle) {
1385 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1386 Stage <= LastStage; ++Stage) {
1387 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1388 Cycle + Stage * Schedule.getInitiationInterval()))) {
1389 Instrs[Cycle].push_front(SU);
1390 }
1391 }
1392 }
1393
1394 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1395 ++Cycle) {
1396 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1397 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1398 for (SUnit *SU : CycleInstrs) {
1399 MachineInstr *MI = SU->getInstr();
1400 OrderedInsts.push_back(MI);
1401 Stages[MI] = Schedule.stageScheduled(SU);
1402 }
1403 }
1404}
1405
1406namespace {
1407
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // Usage count per critical functional unit / processor resource; filled by
  // calcCriticalResources and consulted as a tie breaker in operator().
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  // On return, F holds the unit mask (itineraries) or the processor resource
  // index (sched model) of the most constrained stage.
  // NOTE(review): when the sched-model path hits an invalid sched class it
  // returns UINT_MAX without writing F — callers then look up Resources with
  // F's previous value (0); confirm this is intended.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX; // sentinel: "no constraint found yet"
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only count stages with exactly one unit choice (critical units).
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  /// Fewer functional-unit alternatives means higher priority; ties are
  /// broken by how heavily the critical unit is used (see Resources).
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1511
1512/// Calculate the maximum register pressure of the scheduled instructions stream
1513class HighRegisterPressureDetector {
1514 MachineBasicBlock *OrigMBB;
1515 const MachineRegisterInfo &MRI;
1516 const TargetRegisterInfo *TRI;
1517
1518 const unsigned PSetNum;
1519
1520 // Indexed by PSet ID
1521 // InitSetPressure takes into account the register pressure of live-in
1522 // registers. It's not depend on how the loop is scheduled, so it's enough to
1523 // calculate them once at the beginning.
1524 std::vector<unsigned> InitSetPressure;
1525
1526 // Indexed by PSet ID
1527 // Upper limit for each register pressure set
1528 std::vector<unsigned> PressureSetLimit;
1529
1530 DenseMap<MachineInstr *, RegisterOperands> ROMap;
1531
1532 using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1533
1534public:
1535 using OrderedInstsTy = std::vector<MachineInstr *>;
1536 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1537
1538private:
1539 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1540 if (Pressures.size() == 0) {
1541 dbgs() << "[]";
1542 } else {
1543 char Prefix = '[';
1544 for (unsigned P : Pressures) {
1545 dbgs() << Prefix << P;
1546 Prefix = ' ';
1547 }
1548 dbgs() << ']';
1549 }
1550 }
1551
1552 void dumpPSet(Register Reg) const {
1553 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1554 // FIXME: The static_cast is a bug compensating bugs in the callers.
1555 VirtRegOrUnit VRegOrUnit =
1556 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1557 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1558 for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
1559 ++PSetIter) {
1560 dbgs() << *PSetIter << ' ';
1561 }
1562 dbgs() << '\n';
1563 }
1564
1565 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1566 Register Reg) const {
1567 // FIXME: The static_cast is a bug compensating bugs in the callers.
1568 VirtRegOrUnit VRegOrUnit =
1569 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1570 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1571 auto PSetIter = MRI.getPressureSets(VRegOrUnit);
1572 unsigned Weight = PSetIter.getWeight();
1573 for (; PSetIter.isValid(); ++PSetIter)
1574 Pressure[*PSetIter] += Weight;
1575 }
1576
1577 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1578 Register Reg) const {
1579 auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
1580 unsigned Weight = PSetIter.getWeight();
1581 for (; PSetIter.isValid(); ++PSetIter) {
1582 auto &P = Pressure[*PSetIter];
1583 assert(P >= Weight &&
1584 "register pressure must be greater than or equal weight");
1585 P -= Weight;
1586 }
1587 }
1588
1589 // Return true if Reg is reserved one, for example, stack pointer
1590 bool isReservedRegister(Register Reg) const {
1591 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1592 }
1593
1594 bool isDefinedInThisLoop(Register Reg) const {
1595 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1596 }
1597
1598 // Search for live-in variables. They are factored into the register pressure
1599 // from the begining. Live-in variables used by every iteration should be
1600 // considered as alive throughout the loop. For example, the variable `c` in
1601 // following code. \code
1602 // int c = ...;
1603 // for (int i = 0; i < n; i++)
1604 // a[i] += b[i] + c;
1605 // \endcode
1606 void computeLiveIn() {
1607 DenseSet<Register> Used;
1608 for (auto &MI : *OrigMBB) {
1609 if (MI.isDebugInstr())
1610 continue;
1611 for (auto &Use : ROMap[&MI].Uses) {
1612 // FIXME: The static_cast is a bug.
1613 Register Reg =
1614 Use.VRegOrUnit.isVirtualReg()
1615 ? Use.VRegOrUnit.asVirtualReg()
1616 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1617 // Ignore the variable that appears only on one side of phi instruction
1618 // because it's used only at the first iteration.
1619 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1620 continue;
1621 if (isReservedRegister(Reg))
1622 continue;
1623 if (isDefinedInThisLoop(Reg))
1624 continue;
1625 Used.insert(Reg);
1626 }
1627 }
1628
1629 for (auto LiveIn : Used)
1630 increaseRegisterPressure(InitSetPressure, LiveIn);
1631 }
1632
1633 // Calculate the upper limit of each pressure set
1634 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1635 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1636 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1637 }
1638
1639 // There are two patterns of last-use.
1640 // - by an instruction of the current iteration
1641 // - by a phi instruction of the next iteration (loop carried value)
1642 //
1643 // Furthermore, following two groups of instructions are executed
1644 // simultaneously
1645 // - next iteration's phi instructions in i-th stage
1646 // - current iteration's instructions in i+1-th stage
1647 //
1648 // This function calculates the last-use of each register while taking into
1649 // account the above two patterns.
1650 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1651 Instr2StageTy &Stages) const {
1652 // We treat virtual registers that are defined and used in this loop.
1653 // Following virtual register will be ignored
1654 // - live-in one
1655 // - defined but not used in the loop (potentially live-out)
1656 DenseSet<Register> TargetRegs;
1657 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1658 if (isDefinedInThisLoop(Reg))
1659 TargetRegs.insert(Reg);
1660 };
1661 for (MachineInstr *MI : OrderedInsts) {
1662 if (MI->isPHI()) {
1663 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1664 UpdateTargetRegs(Reg);
1665 } else {
1666 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1667 // FIXME: The static_cast is a bug.
1668 Register Reg = Use.VRegOrUnit.isVirtualReg()
1669 ? Use.VRegOrUnit.asVirtualReg()
1670 : Register(static_cast<unsigned>(
1671 Use.VRegOrUnit.asMCRegUnit()));
1672 UpdateTargetRegs(Reg);
1673 }
1674 }
1675 }
1676
1677 const auto InstrScore = [&Stages](MachineInstr *MI) {
1678 return Stages[MI] + MI->isPHI();
1679 };
1680
1681 DenseMap<Register, MachineInstr *> LastUseMI;
1682 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1683 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1684 // FIXME: The static_cast is a bug.
1685 Register Reg =
1686 Use.VRegOrUnit.isVirtualReg()
1687 ? Use.VRegOrUnit.asVirtualReg()
1688 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1689 if (!TargetRegs.contains(Reg))
1690 continue;
1691 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1692 if (!Inserted) {
1693 MachineInstr *Orig = Ite->second;
1694 MachineInstr *New = MI;
1695 if (InstrScore(Orig) < InstrScore(New))
1696 Ite->second = New;
1697 }
1698 }
1699 }
1700
1701 Instr2LastUsesTy LastUses;
1702 for (auto [Reg, MI] : LastUseMI)
1703 LastUses[MI].insert(Reg);
1704 return LastUses;
1705 }
1706
1707 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1708 // iterations and check the register pressure at the point where all stages
1709 // overlapping.
1710 //
1711 // An example of unrolled loop where #Stage is 4..
1712 // Iter i+0 i+1 i+2 i+3
1713 // ------------------------
1714 // Stage 0
1715 // Stage 1 0
1716 // Stage 2 1 0
1717 // Stage 3 2 1 0 <- All stages overlap
1718 //
1719 std::vector<unsigned>
1720 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1721 Instr2StageTy &Stages,
1722 const unsigned StageCount) const {
1723 using RegSetTy = SmallDenseSet<Register, 16>;
1724
1725 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1726 // manage the liveness of the registers independently by iterations.
1727 SmallVector<RegSetTy> LiveRegSets(StageCount);
1728
1729 auto CurSetPressure = InitSetPressure;
1730 auto MaxSetPressure = InitSetPressure;
1731 auto LastUses = computeLastUses(OrderedInsts, Stages);
1732
1733 LLVM_DEBUG({
1734 dbgs() << "Ordered instructions:\n";
1735 for (MachineInstr *MI : OrderedInsts) {
1736 dbgs() << "Stage " << Stages[MI] << ": ";
1737 MI->dump();
1738 }
1739 });
1740
1741 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1742 VirtRegOrUnit VRegOrUnit) {
1743 // FIXME: The static_cast is a bug.
1744 Register Reg =
1745 VRegOrUnit.isVirtualReg()
1746 ? VRegOrUnit.asVirtualReg()
1747 : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
1748 if (!Reg.isValid() || isReservedRegister(Reg))
1749 return;
1750
1751 bool Inserted = RegSet.insert(Reg).second;
1752 if (!Inserted)
1753 return;
1754
1755 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1756 increaseRegisterPressure(CurSetPressure, Reg);
1757 LLVM_DEBUG(dumpPSet(Reg));
1758 };
1759
1760 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1761 Register Reg) {
1762 if (!Reg.isValid() || isReservedRegister(Reg))
1763 return;
1764
1765 // live-in register
1766 if (!RegSet.contains(Reg))
1767 return;
1768
1769 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1770 RegSet.erase(Reg);
1771 decreaseRegisterPressure(CurSetPressure, Reg);
1772 LLVM_DEBUG(dumpPSet(Reg));
1773 };
1774
1775 for (unsigned I = 0; I < StageCount; I++) {
1776 for (MachineInstr *MI : OrderedInsts) {
1777 const auto Stage = Stages[MI];
1778 if (I < Stage)
1779 continue;
1780
1781 const unsigned Iter = I - Stage;
1782
1783 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1784 InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
1785
1786 for (auto LastUse : LastUses[MI]) {
1787 if (MI->isPHI()) {
1788 if (Iter != 0)
1789 EraseReg(LiveRegSets[Iter - 1], LastUse);
1790 } else {
1791 EraseReg(LiveRegSets[Iter], LastUse);
1792 }
1793 }
1794
1795 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1796 MaxSetPressure[PSet] =
1797 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1798
1799 LLVM_DEBUG({
1800 dbgs() << "CurSetPressure=";
1801 dumpRegisterPressures(CurSetPressure);
1802 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1803 MI->dump();
1804 });
1805 }
1806 }
1807
1808 return MaxSetPressure;
1809 }
1810
1811public:
1812 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1813 const MachineFunction &MF)
1814 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1815 TRI(MF.getSubtarget().getRegisterInfo()),
1816 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1817 PressureSetLimit(PSetNum, 0) {}
1818
1819 // Used to calculate register pressure, which is independent of loop
1820 // scheduling.
1821 void init(const RegisterClassInfo &RCI) {
1822 for (MachineInstr &MI : *OrigMBB) {
1823 if (MI.isDebugInstr())
1824 continue;
1825 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1826 }
1827
1828 computeLiveIn();
1829 computePressureSetLimit(RCI);
1830 }
1831
1832 // Calculate the maximum register pressures of the loop and check if they
1833 // exceed the limit
1834 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1835 const unsigned MaxStage) const {
1837 "the percentage of the margin must be between 0 to 100");
1838
1839 OrderedInstsTy OrderedInsts;
1840 Instr2StageTy Stages;
1841 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1842 const auto MaxSetPressure =
1843 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1844
1845 LLVM_DEBUG({
1846 dbgs() << "Dump MaxSetPressure:\n";
1847 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1848 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1849 }
1850 dbgs() << '\n';
1851 });
1852
1853 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1854 unsigned Limit = PressureSetLimit[PSet];
1855 unsigned Margin = Limit * RegPressureMargin / 100;
1856 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1857 << " Margin=" << Margin << "\n");
1858 if (Limit < MaxSetPressure[PSet] + Margin) {
1859 LLVM_DEBUG(
1860 dbgs()
1861 << "Rejected the schedule because of too high register pressure\n");
1862 return true;
1863 }
1864 }
1865 return false;
1866 }
1867};
1868
1869} // end anonymous namespace
1870
1871/// Calculate the resource constrained minimum initiation interval for the
1872/// specified loop. We use the DFA to model the resources needed for
1873/// each instruction, and we ignore dependences. A different DFA is created
1874/// for each cycle that is required. When adding a new instruction, we attempt
1875/// to add it to each existing DFA, until a legal space is found. If the
1876/// instruction cannot be reserved in an existing DFA, we create a new one.
1877unsigned SwingSchedulerDAG::calculateResMII() {
1878 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1879 ResourceManager RM(&MF.getSubtarget(), this);
1880 return RM.calculateResMII();
1881}
1882
1883/// Calculate the recurrence-constrainted minimum initiation interval.
1884/// Iterate over each circuit. Compute the delay(c) and distance(c)
1885/// for each circuit. The II needs to satisfy the inequality
1886/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1887/// II that satisfies the inequality, and the RecMII is the maximum
1888/// of those values.
1889unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1890 unsigned RecMII = 0;
1891
1892 for (NodeSet &Nodes : NodeSets) {
1893 if (Nodes.empty())
1894 continue;
1895
1896 unsigned Delay = Nodes.getLatency();
1897 unsigned Distance = 1;
1898
1899 // ii = ceil(delay / distance)
1900 unsigned CurMII = (Delay + Distance - 1) / Distance;
1901 Nodes.setRecMII(CurMII);
1902 if (CurMII > RecMII)
1903 RecMII = CurMII;
1904 }
1905
1906 return RecMII;
1907}
1908
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  // Maps the tail of an output-dependence chain to its head, so that a single
  // back-edge per chain can be added after the main loop.
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          // Extend an existing chain: remap its head to the new tail.
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1972
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  // Johnson-style elementary-circuit search. S is the start node of the
  // current search; only nodes numbered >= S participate, so each circuit is
  // discovered exactly once, from its lowest-numbered node.
  SUnit *SV = &SUnits[V];
  // F becomes true when some circuit through V back to S is found.
  bool F = false;
  Stack.insert(SV); // Current path from S to V.
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Global cap on the number of circuits recorded, to bound compile time.
    if (NumPaths > MaxPaths)
      break;
    if (W < S)
      continue;
    if (W == S) {
      // The current stack plus the edge V->S forms a circuit. Record it
      // unless the path was reached through a back-edge in the topological
      // order (HasBackedge).
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      // The edge V->W counts as a back-edge when W precedes V in the
      // topological order.
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V was found: keep V blocked and register it in the
    // B sets of its successors so it is unblocked when any of them is
    // (standard Johnson bookkeeping).
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
2014
2015/// Unblock a node in the circuit finding algorithm.
2016void SwingSchedulerDAG::Circuits::unblock(int U) {
2017 Blocked.reset(U);
2018 SmallPtrSet<SUnit *, 4> &BU = B[U];
2019 while (!BU.empty()) {
2020 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2021 assert(SI != BU.end() && "Invalid B set.");
2022 SUnit *W = *SI;
2023 BU.erase(W);
2024 if (Blocked.test(W->NodeNum))
2025 unblock(W->NodeNum);
2026 }
2027}
2028
2029/// Identify all the elementary circuits in the dependence graph using
2030/// Johnson's circuit algorithm.
2031void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2032 Circuits Cir(SUnits, Topo);
2033 // Create the adjacency structure.
2034 Cir.createAdjacencyStructure(this);
2035 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2036 Cir.reset();
2037 Cir.circuit(I, I, NodeSets, this);
2038 }
2039}
2040
2041// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
2042// is loop-carried to the USE in next iteration. This will help pipeliner avoid
2043// additional copies that are needed across iterations. An artificial dependence
2044// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
2045
2046// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
2047// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
2048// PHI-------True-Dep------> USEOfPhi
2049
2050// The mutation creates
2051// USEOfPHI -------Artificial-Dep---> SRCOfCopy
2052
2053// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
2054// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
2055// late to avoid additional copies across iterations. The possible scheduling
2056// order would be
2057// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
2058
2059void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
2060 for (SUnit &SU : DAG->SUnits) {
2061 // Find the COPY/REG_SEQUENCE instruction.
2062 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
2063 continue;
2064
2065 // Record the loop carried PHIs.
2067 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2069
2070 for (auto &Dep : SU.Preds) {
2071 SUnit *TmpSU = Dep.getSUnit();
2072 MachineInstr *TmpMI = TmpSU->getInstr();
2073 SDep::Kind DepKind = Dep.getKind();
2074 // Save the loop carried PHI.
2075 if (DepKind == SDep::Anti && TmpMI->isPHI())
2076 PHISUs.push_back(TmpSU);
2077 // Save the source of COPY/REG_SEQUENCE.
2078 // If the source has no pre-decessors, we will end up creating cycles.
2079 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2080 SrcSUs.push_back(TmpSU);
2081 }
2082
2083 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2084 continue;
2085
2086 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2087 // SUnit to the container.
2089 // Do not use iterator based loop here as we are updating the container.
2090 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2091 for (auto &Dep : PHISUs[Index]->Succs) {
2092 if (Dep.getKind() != SDep::Data)
2093 continue;
2094
2095 SUnit *TmpSU = Dep.getSUnit();
2096 MachineInstr *TmpMI = TmpSU->getInstr();
2097 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2098 PHISUs.push_back(TmpSU);
2099 continue;
2100 }
2101 UseSUs.push_back(TmpSU);
2102 }
2103 }
2104
2105 if (UseSUs.size() == 0)
2106 continue;
2107
2108 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2109 // Add the artificial dependencies if it does not form a cycle.
2110 for (auto *I : UseSUs) {
2111 for (auto *Src : SrcSUs) {
2112 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2113 Src->addPred(SDep(I, SDep::Artificial));
2114 SDAG->Topo.AddPred(Src, I);
2115 }
2116 }
2117 }
2118 }
2119}
2120
2121/// Compute several functions need to order the nodes for scheduling.
2122/// ASAP - Earliest time to schedule a node.
2123/// ALAP - Latest time to schedule a node.
2124/// MOV - Mobility function, difference between ALAP and ASAP.
2125/// D - Depth of each node.
2126/// H - Height of each node.
2127void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2128 ScheduleInfo.resize(SUnits.size());
2129
2130 LLVM_DEBUG({
2131 for (int I : Topo) {
2132 const SUnit &SU = SUnits[I];
2133 dumpNode(SU);
2134 }
2135 });
2136
2137 int maxASAP = 0;
2138 // Compute ASAP and ZeroLatencyDepth.
2139 for (int I : Topo) {
2140 int asap = 0;
2141 int zeroLatencyDepth = 0;
2142 SUnit *SU = &SUnits[I];
2143 for (const auto &IE : DDG->getInEdges(SU)) {
2144 SUnit *Pred = IE.getSrc();
2145 if (IE.getLatency() == 0)
2146 zeroLatencyDepth =
2147 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2148 if (IE.ignoreDependence(true))
2149 continue;
2150 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2151 IE.getDistance() * MII));
2152 }
2153 maxASAP = std::max(maxASAP, asap);
2154 ScheduleInfo[I].ASAP = asap;
2155 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2156 }
2157
2158 // Compute ALAP, ZeroLatencyHeight, and MOV.
2159 for (int I : llvm::reverse(Topo)) {
2160 int alap = maxASAP;
2161 int zeroLatencyHeight = 0;
2162 SUnit *SU = &SUnits[I];
2163 for (const auto &OE : DDG->getOutEdges(SU)) {
2164 SUnit *Succ = OE.getDst();
2165 if (Succ->isBoundaryNode())
2166 continue;
2167 if (OE.getLatency() == 0)
2168 zeroLatencyHeight =
2169 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2170 if (OE.ignoreDependence(true))
2171 continue;
2172 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2173 OE.getDistance() * MII));
2174 }
2175
2176 ScheduleInfo[I].ALAP = alap;
2177 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2178 }
2179
2180 // After computing the node functions, compute the summary for each node set.
2181 for (NodeSet &I : NodeSets)
2182 I.computeNodeSetInfo(this);
2183
2184 LLVM_DEBUG({
2185 for (unsigned i = 0; i < SUnits.size(); i++) {
2186 dbgs() << "\tNode " << i << ":\n";
2187 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2188 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2189 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2190 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2191 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2192 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2193 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2194 }
2195 });
2196}
2197
2198/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2199/// as the predecessors of the elements of NodeOrder that are not also in
2200/// NodeOrder.
2203 const NodeSet *S = nullptr) {
2204 Preds.clear();
2205
2206 for (SUnit *SU : NodeOrder) {
2207 for (const auto &IE : DDG->getInEdges(SU)) {
2208 SUnit *PredSU = IE.getSrc();
2209 if (S && S->count(PredSU) == 0)
2210 continue;
2211 if (IE.ignoreDependence(true))
2212 continue;
2213 if (NodeOrder.count(PredSU) == 0)
2214 Preds.insert(PredSU);
2215 }
2216
2217 // FIXME: The following loop-carried dependencies may also need to be
2218 // considered.
2219 // - Physical register dependencies (true-dependence and WAW).
2220 // - Memory dependencies.
2221 for (const auto &OE : DDG->getOutEdges(SU)) {
2222 SUnit *SuccSU = OE.getDst();
2223 if (!OE.isAntiDep())
2224 continue;
2225 if (S && S->count(SuccSU) == 0)
2226 continue;
2227 if (NodeOrder.count(SuccSU) == 0)
2228 Preds.insert(SuccSU);
2229 }
2230 }
2231 return !Preds.empty();
2232}
2233
2234/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2235/// as the successors of the elements of NodeOrder that are not also in
2236/// NodeOrder.
2239 const NodeSet *S = nullptr) {
2240 Succs.clear();
2241
2242 for (SUnit *SU : NodeOrder) {
2243 for (const auto &OE : DDG->getOutEdges(SU)) {
2244 SUnit *SuccSU = OE.getDst();
2245 if (S && S->count(SuccSU) == 0)
2246 continue;
2247 if (OE.ignoreDependence(false))
2248 continue;
2249 if (NodeOrder.count(SuccSU) == 0)
2250 Succs.insert(SuccSU);
2251 }
2252
2253 // FIXME: The following loop-carried dependencies may also need to be
2254 // considered.
2255 // - Physical register dependnecies (true-dependnece and WAW).
2256 // - Memory dependencies.
2257 for (const auto &IE : DDG->getInEdges(SU)) {
2258 SUnit *PredSU = IE.getSrc();
2259 if (!IE.isAntiDep())
2260 continue;
2261 if (S && S->count(PredSU) == 0)
2262 continue;
2263 if (NodeOrder.count(PredSU) == 0)
2264 Succs.insert(PredSU);
2265 }
2266 }
2267 return !Succs.empty();
2268}
2269
2270/// Return true if there is a path from the specified node to any of the nodes
2271/// in DestNodes. Keep track and return the nodes in any path.
2272static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2273 SetVector<SUnit *> &DestNodes,
2274 SetVector<SUnit *> &Exclude,
2275 SmallPtrSet<SUnit *, 8> &Visited,
2276 SwingSchedulerDDG *DDG) {
2277 if (Cur->isBoundaryNode())
2278 return false;
2279 if (Exclude.contains(Cur))
2280 return false;
2281 if (DestNodes.contains(Cur))
2282 return true;
2283 if (!Visited.insert(Cur).second)
2284 return Path.contains(Cur);
2285 bool FoundPath = false;
2286 for (const auto &OE : DDG->getOutEdges(Cur))
2287 if (!OE.ignoreDependence(false))
2288 FoundPath |=
2289 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2290 for (const auto &IE : DDG->getInEdges(Cur))
2291 if (IE.isAntiDep() && IE.getDistance() == 0)
2292 FoundPath |=
2293 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2294 if (FoundPath)
2295 Path.insert(Cur);
2296 return FoundPath;
2297}
2298
2299/// Compute the live-out registers for the instructions in a node-set.
2300/// The live-out registers are those that are defined in the node-set,
2301/// but not used. Except for use operands of Phis.
2303 NodeSet &NS) {
2308 for (SUnit *SU : NS) {
2309 const MachineInstr *MI = SU->getInstr();
2310 if (MI->isPHI())
2311 continue;
2312 for (const MachineOperand &MO : MI->all_uses()) {
2313 Register Reg = MO.getReg();
2314 if (Reg.isVirtual())
2315 Uses.insert(VirtRegOrUnit(Reg));
2316 else if (MRI.isAllocatable(Reg))
2317 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2318 Uses.insert(VirtRegOrUnit(Unit));
2319 }
2320 }
2321 for (SUnit *SU : NS)
2322 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2323 if (!MO.isDead()) {
2324 Register Reg = MO.getReg();
2325 if (Reg.isVirtual()) {
2326 if (!Uses.count(VirtRegOrUnit(Reg)))
2327 LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
2329 } else if (MRI.isAllocatable(Reg)) {
2330 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2331 if (!Uses.count(VirtRegOrUnit(Unit)))
2332 LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
2334 }
2335 }
2336 RPTracker.addLiveRegs(LiveOutRegs);
2337}
2338
2339/// A heuristic to filter nodes in recurrent node-sets if the register
2340/// pressure of a set is too high.
2341void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2342 for (auto &NS : NodeSets) {
2343 // Skip small node-sets since they won't cause register pressure problems.
2344 if (NS.size() <= 2)
2345 continue;
2346 IntervalPressure RecRegPressure;
2347 RegPressureTracker RecRPTracker(RecRegPressure);
2348 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2349 computeLiveOuts(MF, RecRPTracker, NS);
2350 RecRPTracker.closeBottom();
2351
2352 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2353 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2354 return A->NodeNum > B->NodeNum;
2355 });
2356
2357 for (auto &SU : SUnits) {
2358 // Since we're computing the register pressure for a subset of the
2359 // instructions in a block, we need to set the tracker for each
2360 // instruction in the node-set. The tracker is set to the instruction
2361 // just after the one we're interested in.
2363 RecRPTracker.setPos(std::next(CurInstI));
2364
2365 RegPressureDelta RPDelta;
2366 ArrayRef<PressureChange> CriticalPSets;
2367 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2368 CriticalPSets,
2369 RecRegPressure.MaxSetPressure);
2370 if (RPDelta.Excess.isValid()) {
2371 LLVM_DEBUG(
2372 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2373 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2374 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2375 NS.setExceedPressure(SU);
2376 break;
2377 }
2378 RecRPTracker.recede();
2379 }
2380 }
2381}
2382
2383/// A heuristic to colocate node sets that have the same set of
2384/// successors.
2385void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2386 unsigned Colocate = 0;
2387 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2388 NodeSet &N1 = NodeSets[i];
2389 SmallSetVector<SUnit *, 8> S1;
2390 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2391 continue;
2392 for (int j = i + 1; j < e; ++j) {
2393 NodeSet &N2 = NodeSets[j];
2394 if (N1.compareRecMII(N2) != 0)
2395 continue;
2396 SmallSetVector<SUnit *, 8> S2;
2397 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2398 continue;
2399 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2400 N1.setColocate(++Colocate);
2401 N2.setColocate(Colocate);
2402 break;
2403 }
2404 }
2405 }
2406}
2407
2408/// Check if the existing node-sets are profitable. If not, then ignore the
2409/// recurrent node-sets, and attempt to schedule all nodes together. This is
2410/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2411/// then it's best to try to schedule all instructions together instead of
2412/// starting with the recurrent node-sets.
2413void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2414 // Look for loops with a large MII.
2415 if (MII < 17)
2416 return;
2417 // Check if the node-set contains only a simple add recurrence.
2418 for (auto &NS : NodeSets) {
2419 if (NS.getRecMII() > 2)
2420 return;
2421 if (NS.getMaxDepth() > MII)
2422 return;
2423 }
2424 NodeSets.clear();
2425 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2426}
2427
2428/// Add the nodes that do not belong to a recurrence set into groups
2429/// based upon connected components.
2430void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2431 SetVector<SUnit *> NodesAdded;
2432 SmallPtrSet<SUnit *, 8> Visited;
2433 // Add the nodes that are on a path between the previous node sets and
2434 // the current node set.
2435 for (NodeSet &I : NodeSets) {
2436 SmallSetVector<SUnit *, 8> N;
2437 // Add the nodes from the current node set to the previous node set.
2438 if (succ_L(I, N, DDG.get())) {
2439 SetVector<SUnit *> Path;
2440 for (SUnit *NI : N) {
2441 Visited.clear();
2442 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2443 }
2444 if (!Path.empty())
2445 I.insert(Path.begin(), Path.end());
2446 }
2447 // Add the nodes from the previous node set to the current node set.
2448 N.clear();
2449 if (succ_L(NodesAdded, N, DDG.get())) {
2450 SetVector<SUnit *> Path;
2451 for (SUnit *NI : N) {
2452 Visited.clear();
2453 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2454 }
2455 if (!Path.empty())
2456 I.insert(Path.begin(), Path.end());
2457 }
2458 NodesAdded.insert_range(I);
2459 }
2460
2461 // Create a new node set with the connected nodes of any successor of a node
2462 // in a recurrent set.
2463 NodeSet NewSet;
2464 SmallSetVector<SUnit *, 8> N;
2465 if (succ_L(NodesAdded, N, DDG.get()))
2466 for (SUnit *I : N)
2467 addConnectedNodes(I, NewSet, NodesAdded);
2468 if (!NewSet.empty())
2469 NodeSets.push_back(NewSet);
2470
2471 // Create a new node set with the connected nodes of any predecessor of a node
2472 // in a recurrent set.
2473 NewSet.clear();
2474 if (pred_L(NodesAdded, N, DDG.get()))
2475 for (SUnit *I : N)
2476 addConnectedNodes(I, NewSet, NodesAdded);
2477 if (!NewSet.empty())
2478 NodeSets.push_back(NewSet);
2479
2480 // Create new nodes sets with the connected nodes any remaining node that
2481 // has no predecessor.
2482 for (SUnit &SU : SUnits) {
2483 if (NodesAdded.count(&SU) == 0) {
2484 NewSet.clear();
2485 addConnectedNodes(&SU, NewSet, NodesAdded);
2486 if (!NewSet.empty())
2487 NodeSets.push_back(NewSet);
2488 }
2489 }
2490}
2491
2492/// Add the node to the set, and add all of its connected nodes to the set.
2493void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2494 SetVector<SUnit *> &NodesAdded) {
2495 NewSet.insert(SU);
2496 NodesAdded.insert(SU);
2497 for (auto &OE : DDG->getOutEdges(SU)) {
2498 SUnit *Successor = OE.getDst();
2499 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2500 NodesAdded.count(Successor) == 0)
2501 addConnectedNodes(Successor, NewSet, NodesAdded);
2502 }
2503 for (auto &IE : DDG->getInEdges(SU)) {
2504 SUnit *Predecessor = IE.getSrc();
2505 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2506 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2507 }
2508}
2509
2510/// Return true if Set1 contains elements in Set2. The elements in common
2511/// are returned in a different container.
2512static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2514 Result.clear();
2515 for (SUnit *SU : Set1) {
2516 if (Set2.count(SU) != 0)
2517 Result.insert(SU);
2518 }
2519 return !Result.empty();
2520}
2521
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two node-sets are fused when they start at the same node.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger recurrence MII of the two sets.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() leaves J pointing at the next element, but invalidates the
        // cached end iterator, so refresh E. I precedes J and stays valid.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2542
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from the later set any node already present in the earlier one.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // erase() leaves J pointing at the next element, but invalidates the
        // cached end iterator, so refresh E. I precedes J and stays valid.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2558
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Pick the initial direction and seed the work-list R based on how this
    // node-set relates to the nodes that are already ordered.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // A single node-set: start bottom-up from the nodes with no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    // Drain R in the current direction, switching direction when R runs dry
    // and reseeding it from the other side of the ordered nodes.
    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Feed the not-yet-ordered successors within this node-set back
          // into the work-list.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // Anti-dependence predecessors act as successors here.
          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // R is exhausted: reverse direction and reseed from the ordered
        // nodes' predecessors inside this node-set.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // A node flagged for excess register pressure restarts the set
          // top-down from its first node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Feed the not-yet-ordered predecessors within this node-set back
          // into the work-list.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // Anti-dependence successors act as predecessors here.
          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // R is exhausted: reverse direction and reseed from the ordered
        // nodes' successors inside this node-set.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2724
2725/// Process the nodes in the computed order and create the pipelined schedule
2726/// of the instructions, if possible. Return true if a schedule is found.
2727bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2728
2729 if (NodeOrder.empty()){
2730 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2731 return false;
2732 }
2733
2734 bool scheduleFound = false;
2735 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2736 if (LimitRegPressure) {
2737 HRPDetector =
2738 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2739 HRPDetector->init(RegClassInfo);
2740 }
2741 // Keep increasing II until a valid schedule is found.
2742 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2743 Schedule.reset();
2744 Schedule.setInitiationInterval(II);
2745 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2746
2749 do {
2750 SUnit *SU = *NI;
2751
2752 // Compute the schedule time for the instruction, which is based
2753 // upon the scheduled time for any predecessors/successors.
2754 int EarlyStart = INT_MIN;
2755 int LateStart = INT_MAX;
2756 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2757 LLVM_DEBUG({
2758 dbgs() << "\n";
2759 dbgs() << "Inst (" << SU->NodeNum << ") ";
2760 SU->getInstr()->dump();
2761 dbgs() << "\n";
2762 });
2763 LLVM_DEBUG(
2764 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2765
2766 if (EarlyStart > LateStart)
2767 scheduleFound = false;
2768 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2769 scheduleFound =
2770 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2771 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2772 scheduleFound =
2773 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2774 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2775 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2776 // When scheduling a Phi it is better to start at the late cycle and
2777 // go backwards. The default order may insert the Phi too far away
2778 // from its first dependence.
2779 // Also, do backward search when all scheduled predecessors are
2780 // loop-carried output/order dependencies. Empirically, there are also
2781 // cases where scheduling becomes possible with backward search.
2782 if (SU->getInstr()->isPHI() ||
2783 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2784 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2785 else
2786 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2787 } else {
2788 int FirstCycle = Schedule.getFirstCycle();
2789 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2790 FirstCycle + getASAP(SU) + II - 1, II);
2791 }
2792
2793 // Even if we find a schedule, make sure the schedule doesn't exceed the
2794 // allowable number of stages. We keep trying if this happens.
2795 if (scheduleFound)
2796 if (SwpMaxStages > -1 &&
2797 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2798 scheduleFound = false;
2799
2800 LLVM_DEBUG({
2801 if (!scheduleFound)
2802 dbgs() << "\tCan't schedule\n";
2803 });
2804 } while (++NI != NE && scheduleFound);
2805
2806 // If a schedule is found, validate it against the validation-only
2807 // dependencies.
2808 if (scheduleFound)
2809 scheduleFound = DDG->isValidSchedule(Schedule);
2810
2811 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2812 if (scheduleFound)
2813 scheduleFound =
2814 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2815
2816 // If a schedule is found, check if it is a valid schedule too.
2817 if (scheduleFound)
2818 scheduleFound = Schedule.isValidSchedule(this);
2819
2820 // If a schedule was found and the option is enabled, check if the schedule
2821 // might generate additional register spills/fills.
2822 if (scheduleFound && LimitRegPressure)
2823 scheduleFound =
2824 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2825 }
2826
2827 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2828 << " (II=" << Schedule.getInitiationInterval()
2829 << ")\n");
2830
2831 if (scheduleFound) {
2832 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2833 if (!scheduleFound)
2834 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2835 }
2836
2837 if (scheduleFound) {
2838 Schedule.finalizeSchedule(this);
2839 Pass.ORE->emit([&]() {
2840 return MachineOptimizationRemarkAnalysis(
2841 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2842 << "Schedule found with Initiation Interval: "
2843 << ore::NV("II", Schedule.getInitiationInterval())
2844 << ", MaxStageCount: "
2845 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2846 });
2847 } else
2848 Schedule.reset();
2849
2850 return scheduleFound && Schedule.getMaxStageCount() > 0;
2851}
2852
2854 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2855 Register Result;
2856 for (const MachineOperand &Use : MI.all_uses()) {
2857 Register Reg = Use.getReg();
2858 if (!Reg.isVirtual())
2859 return Register();
2860 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2861 continue;
2862 if (Result)
2863 return Register();
2864 Result = Reg;
2865 }
2866 return Result;
2867}
2868
2869/// When Op is a value that is incremented recursively in a loop and there is a
2870/// unique instruction that increments it, returns true and sets Value.
2872 if (!Op.isReg() || !Op.getReg().isVirtual())
2873 return false;
2874
2875 Register OrgReg = Op.getReg();
2876 Register CurReg = OrgReg;
2877 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2878 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2879
2880 const TargetInstrInfo *TII =
2881 LoopBB->getParent()->getSubtarget().getInstrInfo();
2882 const TargetRegisterInfo *TRI =
2883 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2884
2885 MachineInstr *Phi = nullptr;
2886 MachineInstr *Increment = nullptr;
2887
2888 // Traverse definitions until it reaches Op or an instruction that does not
2889 // satisfy the condition.
2890 // Acceptable example:
2891 // bb.0:
2892 // %0 = PHI %3, %bb.0, ...
2893 // %2 = ADD %0, Value
2894 // ... = LOAD %2(Op)
2895 // %3 = COPY %2
2896 while (true) {
2897 if (!CurReg.isValid() || !CurReg.isVirtual())
2898 return false;
2899 MachineInstr *Def = MRI.getVRegDef(CurReg);
2900 if (Def->getParent() != LoopBB)
2901 return false;
2902
2903 if (Def->isCopy()) {
2904 // Ignore copy instructions unless they contain subregisters
2905 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2906 return false;
2907 CurReg = Def->getOperand(1).getReg();
2908 } else if (Def->isPHI()) {
2909 // There must be just one Phi
2910 if (Phi)
2911 return false;
2912 Phi = Def;
2913 CurReg = getLoopPhiReg(*Def, LoopBB);
2914 } else if (TII->getIncrementValue(*Def, Value)) {
2915 // Potentially a unique increment
2916 if (Increment)
2917 // Multiple increments exist
2918 return false;
2919
2920 const MachineOperand *BaseOp;
2921 int64_t Offset;
2922 bool OffsetIsScalable;
2923 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2924 TRI)) {
2925 // Pre/post increment instruction
2926 CurReg = BaseOp->getReg();
2927 } else {
2928 // If only one of the operands is defined within the loop, it is assumed
2929 // to be an incremented value.
2930 CurReg = findUniqueOperandDefinedInLoop(*Def);
2931 if (!CurReg.isValid())
2932 return false;
2933 }
2934 Increment = Def;
2935 } else {
2936 return false;
2937 }
2938 if (CurReg == OrgReg)
2939 break;
2940 }
2941
2942 if (!Phi || !Increment)
2943 return false;
2944
2945 return true;
2946}
2947
2948/// Return true if we can compute the amount the instruction changes
2949/// during each iteration. Set Delta to the amount of the change.
2950bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2951 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2952 const MachineOperand *BaseOp;
2953 int64_t Offset;
2954 bool OffsetIsScalable;
2955 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2956 return false;
2957
2958 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2959 if (OffsetIsScalable)
2960 return false;
2961
2962 if (!BaseOp->isReg())
2963 return false;
2964
2965 return findLoopIncrementValue(*BaseOp, Delta);
2966}
2967
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
/// v1 = Phi(v0, v3)
/// v2 = load v1, 0
/// v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              Register &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction. Post-increment loads already consume their
  // base update, so they cannot be rebased.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction that merges the initial and the
  // loop-carried value of the base register.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction that produces the
  // loop-carried base value (v3 in the example above).
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration. A throwaway clone with the combined offset is used
  // only to query the target's disjointness analysis, then deleted.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
3028
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
    SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // The recorded change is a <replacement base register, per-stage offset
    // delta> pair for this instruction.
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare where the base-register definition and this use were placed in
    // the final schedule; an adjustment is needed only when the use lands in
    // an earlier stage than the definition.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // The definition comes earlier in the cycle order, so switch to the
        // replacement base register and compensate by one fewer stage.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Scale the recorded per-stage delta by the stage distance.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Install the rewritten instruction and remember the mapping so the
      // original can be restored or cleaned up later.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
3064
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against cycles of Phis: stop if we see the same Phi twice.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in <value, block> pairs starting at index 1; follow
    // the incoming value that flows from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3082
3083/// Return false if there is no overlap between the region accessed by BaseMI in
3084/// an iteration and the region accessed by OtherMI in subsequent iterations.
3086 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
3087 int DeltaB, DeltaO, Delta;
3088 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
3089 DeltaB != DeltaO)
3090 return true;
3091 Delta = DeltaB;
3092
3093 const MachineOperand *BaseOpB, *BaseOpO;
3094 int64_t OffsetB, OffsetO;
3095 bool OffsetBIsScalable, OffsetOIsScalable;
3096 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3097 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
3098 OffsetBIsScalable, TRI) ||
3099 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
3100 OffsetOIsScalable, TRI))
3101 return true;
3102
3103 if (OffsetBIsScalable || OffsetOIsScalable)
3104 return true;
3105
3106 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
3107 // Pass cases with different base operands but same initial values.
3108 // Typically for when pre/post increment is used.
3109
3110 if (!BaseOpB->isReg() || !BaseOpO->isReg())
3111 return true;
3112 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3113 if (!RegB.isVirtual() || !RegO.isVirtual())
3114 return true;
3115
3116 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
3117 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
3118 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
3119 return true;
3120
3121 Register InitValB;
3122 Register LoopValB;
3123 Register InitValO;
3124 Register LoopValO;
3125 getPhiRegs(*DefB, BB, InitValB, LoopValB);
3126 getPhiRegs(*DefO, BB, InitValO, LoopValO);
3127 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
3128 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
3129
3130 if (!InitDefB->isIdenticalTo(*InitDefO))
3131 return true;
3132 }
3133
3134 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3135 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3136
3137 // This is the main test, which checks the offset values and the loop
3138 // increment value to determine if the accesses may be loop carried.
3139 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3140 return true;
3141
3142 LLVM_DEBUG({
3143 dbgs() << "Overlap check:\n";
3144 dbgs() << " BaseMI: ";
3145 BaseMI->dump();
3146 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3147 << ", Len: " << AccessSizeB.getValue() << "\n";
3148 dbgs() << " OtherMI: ";
3149 OtherMI->dump();
3150 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3151 << ", Len: " << AccessSizeO.getValue() << "\n";
3152 });
3153
3154 // Excessive overlap may be detected in strided patterns.
3155 // For example, the memory addresses of the store and the load in
3156 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3157 // are assumed to overlap.
3158 if (Delta < 0) {
3159 int64_t BaseMinAddr = OffsetB;
3160 int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
3161 if (BaseMinAddr > OhterNextIterMaxAddr) {
3162 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3163 return false;
3164 }
3165 } else {
3166 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3167 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3168 if (BaseMaxAddr < OtherNextIterMinAddr) {
3169 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3170 return false;
3171 }
3172 }
3173 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3174 return true;
3175}
3176
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order/output dependences to real (non-boundary, non-artificial)
  // nodes can be loop carried.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only memory accesses can carry a dependence across iterations here.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3210
3211void SwingSchedulerDAG::postProcessDAG() {
3212 for (auto &M : Mutations)
3213 M->apply(this);
3214}
3215
3216/// Try to schedule the node at the specified StartCycle and continue
3217/// until the node is schedule or the EndCycle is reached. This function
3218/// returns true if the node is scheduled. This routine may search either
3219/// forward or backward for a place to insert the instruction based upon
3220/// the relative values of StartCycle and EndCycle.
3221bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3222 bool forward = true;
3223 LLVM_DEBUG({
3224 dbgs() << "Trying to insert node between " << StartCycle << " and "
3225 << EndCycle << " II: " << II << "\n";
3226 });
3227 if (StartCycle > EndCycle)
3228 forward = false;
3229
3230 // The terminating condition depends on the direction.
3231 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3232 for (int curCycle = StartCycle; curCycle != termCycle;
3233 forward ? ++curCycle : --curCycle) {
3234
3235 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3236 ProcItinResources.canReserveResources(*SU, curCycle)) {
3237 LLVM_DEBUG({
3238 dbgs() << "\tinsert at cycle " << curCycle << " ";
3239 SU->getInstr()->dump();
3240 });
3241
3242 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3243 ProcItinResources.reserveResources(*SU, curCycle);
3244 ScheduledInstrs[curCycle].push_back(SU);
3245 InstrToCycle.insert(std::make_pair(SU, curCycle));
3246 if (curCycle > LastCycle)
3247 LastCycle = curCycle;
3248 if (curCycle < FirstCycle)
3249 FirstCycle = curCycle;
3250 return true;
3251 }
3252 LLVM_DEBUG({
3253 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3254 SU->getInstr()->dump();
3255 });
3256 }
3257 return false;
3258}
3259
// Return the cycle of the earliest scheduled instruction in the chain
// reached from Dep by walking order/output in-edges backwards.
// Returns INT_MAX if no instruction in the chain has been scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    if (Visited.count(PrevSU))
      continue;
    // Unscheduled nodes do not contribute a cycle and are not expanded.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    // Continue the walk only through order and output dependences.
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3283
// Return the cycle of the latest scheduled instruction in the chain
// reached from Dep by walking order/output out-edges forwards.
// Returns INT_MIN if no instruction in the chain has been scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    // Boundary nodes are not real instructions; skip them.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    // Unscheduled nodes do not contribute a cycle and are not expanded.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    // Continue the walk only through order and output dependences.
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3307
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi it feeds; otherwise return nullptr. These instructions
/// are characterized by having a back-edge to a Phi, which contains a
/// reference to another Phi.
  for (auto &P : SU->Preds)
    // An anti-dependence predecessor that is a Phi is SU's back-edge target.
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      // The Phi spans iterations when its value feeds another Phi.
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3319
/// Compute the scheduling start slot for the instruction. The start slot
/// depends on any predecessor or successor nodes scheduled already.
/// MaxEarlyStart is raised by scheduled predecessors; MinLateStart is
/// lowered by scheduled successors and by loop-carried dependences.
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
                              int II, SwingSchedulerDAG *DAG) {
  const SwingSchedulerDDG *DDG = DAG->getDDG();

  // Iterate over each instruction that has been scheduled already. The start
  // slot computation depends on whether the previously scheduled instruction
  // is a predecessor or successor of the specified instruction.
  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    for (SUnit *I : getInstructions(cycle)) {
      // Scheduled predecessors constrain how early SU may be placed.
      for (const auto &IE : DDG->getInEdges(SU)) {
        if (IE.getSrc() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(IE)) {
            int End = earliestCycleInChain(IE, DDG) + (II - 1);
            *MinLateStart = std::min(*MinLateStart, End);
          }
          // Latency pushes SU later; each iteration of distance buys II
          // cycles of slack.
          int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
          *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
        }
      }

      // Scheduled successors constrain how late SU may be placed.
      for (const auto &OE : DDG->getOutEdges(SU)) {
        if (OE.getDst() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(OE)) {
            int Start = latestCycleInChain(OE, DDG) + 1 - II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
          }
          int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
          *MinLateStart = std::min(*MinLateStart, LateStart);
        }
      }

      SUnit *BE = multipleIterations(I, DAG);
      for (const auto &Dep : SU->Preds) {
        // For instruction that requires multiple iterations, make sure that
        // the dependent instruction is not scheduled past the definition.
        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
            !SU->isPred(I))
          *MinLateStart = std::min(*MinLateStart, cycle);
      }
    }
  }
}
3368
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Classification of SU relative to each instruction already in Insts.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  // Positions within Insts of the instructions forcing the ordering.
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      // If this instruction's base register has been rewritten (see
      // InstrChanges), compare against the replacement register instead.
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      // SU defines a value read by *I in the same or earlier stage: SU's
      // def must come first.
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        // SU uses a Phi whose loop-carried value *I defines; note it but let
        // OrderAfterDef take precedence below.
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      // The source of a same-stage anti/output/order edge must stay ahead.
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    // Pull out the conflicting instructions and re-insert all three in
    // dependence order (recursing via this same routine).
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3501
3502/// Return true if the scheduled Phi has a loop carried operand.
3504 MachineInstr &Phi) const {
3505 if (!Phi.isPHI())
3506 return false;
3507 assert(Phi.isPHI() && "Expecting a Phi.");
3508 SUnit *DefSU = SSD->getSUnit(&Phi);
3509 unsigned DefCycle = cycleScheduled(DefSU);
3510 int DefStage = stageScheduled(DefSU);
3511
3512 Register InitVal;
3513 Register LoopVal;
3514 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3515 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3516 if (!UseSU)
3517 return true;
3518 if (UseSU->getInstr()->isPHI())
3519 return true;
3520 unsigned LoopCycle = cycleScheduled(UseSU);
3521 int LoopStage = stageScheduled(UseSU);
3522 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3523}
3524
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
    MachineInstr *Def,
    MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  // A Phi cannot itself be the loop-carried definition in this pattern.
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def carries the dependence iff it defines the Phi's loop-carried value.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3551
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): as written, any in-edge whose source is already scheduled
  // makes this return false; the edge kind is not inspected here. Presumably
  // the in-edges of SU at this point are only loop-carried output/order
  // dependences, matching the name -- confirm against the DDG construction.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3561
/// Determine transitive dependences of unpipelineable instructions: start
/// from the nodes the target asks to ignore for pipelining and close the set
/// over their (transitive) predecessors.
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with nodes the target explicitly excludes.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable node depends on is also unpipelineable.
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3590
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Nodes that may be pipelined, or are already in stage 0, only update
    // the running last-cycle bound.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    // It must not be placed before any same-iteration dependence source.
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move SU from its old cycle's instruction list to the new cycle's.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             |  Inst0  <-+
    //   SU order  |           | artificial dep
    //             |  Inst1  --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3659
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    // Every physreg consumer must be in the same stage and a strictly
    // later cycle than the definition.
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3688
3689/// A property of the node order in swing-modulo-scheduling is
3690/// that for nodes outside circuits the following holds:
3691/// none of them is scheduled after both a successor and a
3692/// predecessor.
3693/// The method below checks whether the property is met.
3694/// If not, debug information is printed and statistics information updated.
3695/// Note that we do not use an assert statement.
3696/// The reason is that although an invalid node order may prevent
3697/// the pipeliner from finding a pipelined schedule for arbitrary II,
3698/// it does not lead to the generation of incorrect code.
3699void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3700
3701 // a sorted vector that maps each SUnit to its index in the NodeOrder
3702 typedef std::pair<SUnit *, unsigned> UnitIndex;
3703 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3704
3705 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3706 Indices.push_back(std::make_pair(NodeOrder[i], i));
3707
3708 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3709 return std::get<0>(i1) < std::get<0>(i2);
3710 };
3711
3712 // sort, so that we can perform a binary search
3713 llvm::sort(Indices, CompareKey);
3714
3715 bool Valid = true;
3716 (void)Valid;
3717 // for each SUnit in the NodeOrder, check whether
3718 // it appears after both a successor and a predecessor
3719 // of the SUnit. If this is the case, and the SUnit
3720 // is not part of circuit, then the NodeOrder is not
3721 // valid.
3722 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3723 SUnit *SU = NodeOrder[i];
3724 unsigned Index = i;
3725
3726 bool PredBefore = false;
3727 bool SuccBefore = false;
3728
3729 SUnit *Succ;
3730 SUnit *Pred;
3731 (void)Succ;
3732 (void)Pred;
3733
3734 for (const auto &IE : DDG->getInEdges(SU)) {
3735 SUnit *PredSU = IE.getSrc();
3736 unsigned PredIndex = std::get<1>(
3737 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3738 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3739 PredBefore = true;
3740 Pred = PredSU;
3741 break;
3742 }
3743 }
3744
3745 for (const auto &OE : DDG->getOutEdges(SU)) {
3746 SUnit *SuccSU = OE.getDst();
3747 // Do not process a boundary node, it was not included in NodeOrder,
3748 // hence not in Indices either, call to std::lower_bound() below will
3749 // return Indices.end().
3750 if (SuccSU->isBoundaryNode())
3751 continue;
3752 unsigned SuccIndex = std::get<1>(
3753 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3754 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3755 SuccBefore = true;
3756 Succ = SuccSU;
3757 break;
3758 }
3759 }
3760
3761 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3762 // instructions in circuits are allowed to be scheduled
3763 // after both a successor and predecessor.
3764 bool InCircuit = llvm::any_of(
3765 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3766 if (InCircuit)
3767 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3768 else {
3769 Valid = false;
3770 NumNodeOrderIssues++;
3771 LLVM_DEBUG(dbgs() << "Predecessor ");
3772 }
3773 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3774 << " are scheduled before node " << SU->NodeNum
3775 << "\n");
3776 }
3777 }
3778
3779 LLVM_DEBUG({
3780 if (!Valid)
3781 dbgs() << "Invalid node order found!\n";
3782 });
3783}
3784
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // p and p' from the example above, carried across loop iterations of the
  // Instrs walk: set when a tied def/use pair is found, consumed when a
  // later use of p is seen.
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            // Subtract the recorded delta since the new base already
            // includes the increment.
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // The pending overlap has been handled (or cannot be); reset.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3836
3837std::deque<SUnit *>
3839 const std::deque<SUnit *> &Instrs) const {
3840 std::deque<SUnit *> NewOrderPhi;
3841 for (SUnit *SU : Instrs) {
3842 if (SU->getInstr()->isPHI())
3843 NewOrderPhi.push_back(SU);
3844 }
3845 std::deque<SUnit *> NewOrderI;
3846 for (SUnit *SU : Instrs) {
3847 if (!SU->getInstr()->isPHI())
3848 orderDependence(SSD, SU, NewOrderI);
3849 }
3850 llvm::append_range(NewOrderPhi, NewOrderI);
3851 return NewOrderPhi;
3852}
3853
3854/// After the schedule has been formed, call this function to combine
3855/// the instructions from the different stages/cycles. That is, this
3856/// function creates a schedule that represents a single iteration.
3858 // Move all instructions to the first stage from later stages.
3859 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3860 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3861 ++stage) {
3862 std::deque<SUnit *> &cycleInstrs =
3863 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3864 for (SUnit *SU : llvm::reverse(cycleInstrs))
3865 ScheduledInstrs[cycle].push_front(SU);
3866 }
3867 }
3868
3869 // Erase all the elements in the later stages. Only one iteration should
3870 // remain in the scheduled list, and it contains all the instructions.
3871 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3872 ScheduledInstrs.erase(cycle);
3873
3874 // Change the registers in instruction as specified in the InstrChanges
3875 // map. We need to use the new registers to create the correct order.
3876 for (const SUnit &SU : SSD->SUnits)
3877 SSD->applyInstrChange(SU.getInstr(), *this);
3878
3879 // Reorder the instructions in each cycle to fix and improve the
3880 // generated code.
3881 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3882 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3883 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3884 SSD->fixupRegisterOverlaps(cycleInstrs);
3885 }
3886
3887 LLVM_DEBUG(dump(););
3888}
3889
3891 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3892 << " depth " << MaxDepth << " col " << Colocate << "\n";
3893 for (const auto &I : Nodes)
3894 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3895 os << "\n";
3896}
3897
3898#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3899/// Print the schedule information to the given output.
3901 // Iterate over each cycle.
3902 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3903 // Iterate over each instruction in the cycle.
3904 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3905 for (SUnit *CI : cycleInstrs->second) {
3906 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3907 os << "(" << CI->NodeNum << ") ";
3908 CI->getInstr()->print(os);
3909 os << "\n";
3910 }
3911 }
3912}
3913
3914/// Utility function used for debugging to print the schedule.
3917
3918void ResourceManager::dumpMRT() const {
3919 LLVM_DEBUG({
3920 if (UseDFA)
3921 return;
3922 std::stringstream SS;
3923 SS << "MRT:\n";
3924 SS << std::setw(4) << "Slot";
3925 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3926 SS << std::setw(3) << I;
3927 SS << std::setw(7) << "#Mops"
3928 << "\n";
3929 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3930 SS << std::setw(4) << Slot;
3931 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3932 SS << std::setw(3) << MRT[Slot][I];
3933 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3934 }
3935 dbgs() << SS.str();
3936 });
3937}
3938#endif
3939
3941 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3942 unsigned ProcResourceID = 0;
3943
3944 // We currently limit the resource kinds to 64 and below so that we can use
3945 // uint64_t for Masks
3946 assert(SM.getNumProcResourceKinds() < 64 &&
3947 "Too many kinds of resources, unsupported");
3948 // Create a unique bitmask for every processor resource unit.
3949 // Skip resource at index 0, since it always references 'InvalidUnit'.
3950 Masks.resize(SM.getNumProcResourceKinds());
3951 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3952 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3953 if (Desc.SubUnitsIdxBegin)
3954 continue;
3955 Masks[I] = 1ULL << ProcResourceID;
3956 ProcResourceID++;
3957 }
3958 // Create a unique bitmask for every processor resource group.
3959 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3960 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3961 if (!Desc.SubUnitsIdxBegin)
3962 continue;
3963 Masks[I] = 1ULL << ProcResourceID;
3964 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3965 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3966 ProcResourceID++;
3967 }
3968 LLVM_DEBUG({
3969 if (SwpShowResMask) {
3970 dbgs() << "ProcResourceDesc:\n";
3971 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3972 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3973 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3974 ProcResource->Name, I, Masks[I],
3975 ProcResource->NumUnits);
3976 }
3977 dbgs() << " -----------------\n";
3978 }
3979 });
3980}
3981
3983 LLVM_DEBUG({
3984 if (SwpDebugResource)
3985 dbgs() << "canReserveResources:\n";
3986 });
3987 if (UseDFA)
3988 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3989 ->canReserveResources(&SU.getInstr()->getDesc());
3990
3991 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3992 if (!SCDesc->isValid()) {
3993 LLVM_DEBUG({
3994 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3995 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3996 });
3997 return true;
3998 }
3999
4000 reserveResources(SCDesc, Cycle);
4001 bool Result = !isOverbooked();
4002 unreserveResources(SCDesc, Cycle);
4003
4004 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
4005 return Result;
4006}
4007
4008void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
4009 LLVM_DEBUG({
4010 if (SwpDebugResource)
4011 dbgs() << "reserveResources:\n";
4012 });
4013 if (UseDFA)
4014 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4015 ->reserveResources(&SU.getInstr()->getDesc());
4016
4017 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4018 if (!SCDesc->isValid()) {
4019 LLVM_DEBUG({
4020 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4021 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4022 });
4023 return;
4024 }
4025
4026 reserveResources(SCDesc, Cycle);
4027
4028 LLVM_DEBUG({
4029 if (SwpDebugResource) {
4030 dumpMRT();
4031 dbgs() << "reserveResources: done!\n\n";
4032 }
4033 });
4034}
4035
4036void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
4037 int Cycle) {
4038 assert(!UseDFA);
4039 for (const MCWriteProcResEntry &PRE : make_range(
4040 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4041 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4042 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4043
4044 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4045 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4046}
4047
4048void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4049 int Cycle) {
4050 assert(!UseDFA);
4051 for (const MCWriteProcResEntry &PRE : make_range(
4052 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4053 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4054 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4055
4056 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4057 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4058}
4059
4060bool ResourceManager::isOverbooked() const {
4061 assert(!UseDFA);
4062 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4063 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4064 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4065 if (MRT[Slot][I] > Desc->NumUnits)
4066 return true;
4067 }
4068 if (NumScheduledMops[Slot] > IssueWidth)
4069 return true;
4070 }
4071 return false;
4072}
4073
4074int ResourceManager::calculateResMIIDFA() const {
4075 assert(UseDFA);
4076
4077 // Sort the instructions by the number of available choices for scheduling,
4078 // least to most. Use the number of critical resources as the tie breaker.
4079 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4080 for (SUnit &SU : DAG->SUnits)
4081 FUS.calcCriticalResources(*SU.getInstr());
4082 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4083 FuncUnitOrder(FUS);
4084
4085 for (SUnit &SU : DAG->SUnits)
4086 FuncUnitOrder.push(SU.getInstr());
4087
4089 Resources.push_back(
4090 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4091
4092 while (!FuncUnitOrder.empty()) {
4093 MachineInstr *MI = FuncUnitOrder.top();
4094 FuncUnitOrder.pop();
4095 if (TII->isZeroCost(MI->getOpcode()))
4096 continue;
4097
4098 // Attempt to reserve the instruction in an existing DFA. At least one
4099 // DFA is needed for each cycle.
4100 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4101 unsigned ReservedCycles = 0;
4102 auto *RI = Resources.begin();
4103 auto *RE = Resources.end();
4104 LLVM_DEBUG({
4105 dbgs() << "Trying to reserve resource for " << NumCycles
4106 << " cycles for \n";
4107 MI->dump();
4108 });
4109 for (unsigned C = 0; C < NumCycles; ++C)
4110 while (RI != RE) {
4111 if ((*RI)->canReserveResources(*MI)) {
4112 (*RI)->reserveResources(*MI);
4113 ++ReservedCycles;
4114 break;
4115 }
4116 RI++;
4117 }
4118 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4119 << ", NumCycles:" << NumCycles << "\n");
4120 // Add new DFAs, if needed, to reserve resources.
4121 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4123 << "NewResource created to reserve resources"
4124 << "\n");
4125 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4126 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4127 NewResource->reserveResources(*MI);
4128 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4129 }
4130 }
4131
4132 int Resmii = Resources.size();
4133 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4134 return Resmii;
4135}
4136
4138 if (UseDFA)
4139 return calculateResMIIDFA();
4140
4141 // Count each resource consumption and divide it by the number of units.
4142 // ResMII is the max value among them.
4143
4144 int NumMops = 0;
4145 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4146 for (SUnit &SU : DAG->SUnits) {
4147 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4148 continue;
4149
4150 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4151 if (!SCDesc->isValid())
4152 continue;
4153
4154 LLVM_DEBUG({
4155 if (SwpDebugResource) {
4156 DAG->dumpNode(SU);
4157 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4158 << " WriteProcRes: ";
4159 }
4160 });
4161 NumMops += SCDesc->NumMicroOps;
4162 for (const MCWriteProcResEntry &PRE :
4163 make_range(STI->getWriteProcResBegin(SCDesc),
4164 STI->getWriteProcResEnd(SCDesc))) {
4165 LLVM_DEBUG({
4166 if (SwpDebugResource) {
4167 const MCProcResourceDesc *Desc =
4168 SM.getProcResource(PRE.ProcResourceIdx);
4169 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4170 }
4171 });
4172 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4173 }
4174 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4175 }
4176
4177 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4178 LLVM_DEBUG({
4179 if (SwpDebugResource)
4180 dbgs() << "#Mops: " << NumMops << ", "
4181 << "IssueWidth: " << IssueWidth << ", "
4182 << "Cycles: " << Result << "\n";
4183 });
4184
4185 LLVM_DEBUG({
4186 if (SwpDebugResource) {
4187 std::stringstream SS;
4188 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4189 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4190 << "\n";
4191 dbgs() << SS.str();
4192 }
4193 });
4194 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4195 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4196 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4197 LLVM_DEBUG({
4198 if (SwpDebugResource) {
4199 std::stringstream SS;
4200 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4201 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4202 << std::setw(10) << Cycles << "\n";
4203 dbgs() << SS.str();
4204 }
4205 });
4206 if (Cycles > Result)
4207 Result = Cycles;
4208 }
4209 return Result;
4210}
4211
4213 InitiationInterval = II;
4214 DFAResources.clear();
4215 DFAResources.resize(II);
4216 for (auto &I : DFAResources)
4217 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4218 MRT.clear();
4219 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4220 NumScheduledMops.clear();
4221 NumScheduledMops.resize(II);
4222}
4223
4224bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4225 if (Pred.isArtificial() || Dst->isBoundaryNode())
4226 return true;
4227 // Currently, dependence that is an anti-dependences but not a loop-carried is
4228 // also ignored. This behavior is preserved to prevent regression.
4229 // FIXME: Remove if this doesn't have significant impact on performance
4230 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4231}
4232
4233SwingSchedulerDDG::SwingSchedulerDDGEdges &
4234SwingSchedulerDDG::getEdges(const SUnit *SU) {
4235 if (SU == EntrySU)
4236 return EntrySUEdges;
4237 if (SU == ExitSU)
4238 return ExitSUEdges;
4239 return EdgesVec[SU->NodeNum];
4240}
4241
4242const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4243SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4244 if (SU == EntrySU)
4245 return EntrySUEdges;
4246 if (SU == ExitSU)
4247 return ExitSUEdges;
4248 return EdgesVec[SU->NodeNum];
4249}
4250
4251void SwingSchedulerDDG::addEdge(const SUnit *SU,
4252 const SwingSchedulerDDGEdge &Edge) {
4253 assert(!Edge.isValidationOnly() &&
4254 "Validation-only edges are not expected here.");
4255 auto &Edges = getEdges(SU);
4256 if (Edge.getSrc() == SU)
4257 Edges.Succs.push_back(Edge);
4258 else
4259 Edges.Preds.push_back(Edge);
4260}
4261
4262void SwingSchedulerDDG::initEdges(SUnit *SU) {
4263 for (const auto &PI : SU->Preds) {
4264 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4265 /*IsValidationOnly=*/false);
4266 addEdge(SU, Edge);
4267 }
4268
4269 for (const auto &SI : SU->Succs) {
4270 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4271 /*IsValidationOnly=*/false);
4272 addEdge(SU, Edge);
4273 }
4274}
4275
4276SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4277 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4278 : EntrySU(EntrySU), ExitSU(ExitSU) {
4279 EdgesVec.resize(SUnits.size());
4280
4281 // Add non-loop-carried edges based on the DAG.
4282 initEdges(EntrySU);
4283 initEdges(ExitSU);
4284 for (auto &SU : SUnits)
4285 initEdges(&SU);
4286
4287 // Add loop-carried edges, which are not represented in the DAG.
4288 for (SUnit &SU : SUnits) {
4289 SUnit *Src = &SU;
4290 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4291 SDep Base(Src, SDep::Barrier);
4292 Base.setLatency(1);
4293 for (SUnit *Dst : *OD) {
4294 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4295 /*IsValidationOnly=*/true);
4296 Edge.setDistance(1);
4297 ValidationOnlyEdges.push_back(Edge);
4298 }
4299 }
4300 }
4301}
4302
4303const SwingSchedulerDDG::EdgesType &
4305 return getEdges(SU).Preds;
4306}
4307
4308const SwingSchedulerDDG::EdgesType &
4310 return getEdges(SU).Succs;
4311}
4312
4313/// Check if \p Schedule doesn't violate the validation-only dependencies.
4315 unsigned II = Schedule.getInitiationInterval();
4316
4317 auto ExpandCycle = [&](SUnit *SU) {
4318 int Stage = Schedule.stageScheduled(SU);
4319 int Cycle = Schedule.cycleScheduled(SU);
4320 return Cycle + (Stage * II);
4321 };
4322
4323 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4324 SUnit *Src = Edge.getSrc();
4325 SUnit *Dst = Edge.getDst();
4326 if (!Src->isInstr() || !Dst->isInstr())
4327 continue;
4328 int CycleSrc = ExpandCycle(Src);
4329 int CycleDst = ExpandCycle(Dst);
4330 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4331 if (CycleSrc > MaxLateStart) {
4332 LLVM_DEBUG({
4333 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4334 << Dst->NodeNum << "\n";
4335 });
4336 return false;
4337 }
4338 }
4339 return true;
4340}
4341
4342void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4343 const TargetInstrInfo *TII) {
4344 for (SUnit &SU : SUnits) {
4345 SUnit *Src = &SU;
4346 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4347 SDep Dep(Src, SDep::Barrier);
4348 Dep.setLatency(1);
4349 for (SUnit *Dst : *OrderDep) {
4350 SUnit *From = Src;
4351 SUnit *To = Dst;
4352 if (From->NodeNum > To->NodeNum)
4353 std::swap(From, To);
4354
4355 // Add a forward edge if the following conditions are met:
4356 //
4357 // - The instruction of the source node (FromMI) may read memory.
4358 // - The instruction of the target node (ToMI) may modify memory, but
4359 // does not read it.
4360 // - Neither instruction is a global barrier.
4361 // - The load appears before the store in the original basic block.
4362 // - There are no barrier or store instructions between the two nodes.
4363 // - The target node is unreachable from the source node in the current
4364 // DAG.
4365 //
4366 // TODO: These conditions are inherited from a previous implementation,
4367 // and some may no longer be necessary. For now, we conservatively
4368 // retain all of them to avoid regressions, but the logic could
4369 // potentially be simplified
4370 MachineInstr *FromMI = From->getInstr();
4371 MachineInstr *ToMI = To->getInstr();
4372 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4373 !TII->isGlobalMemoryObject(FromMI) &&
4374 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4375 SDep Pred = Dep;
4376 Pred.setSUnit(From);
4377 To->addPred(Pred);
4378 }
4379 }
4380 }
4381 }
4382}
4383
4385 const MachineRegisterInfo *MRI) const {
4386 const auto *Order = getOrderDepOrNull(SU);
4387
4388 if (!Order)
4389 return;
4390
4391 const auto DumpSU = [](const SUnit *SU) {
4392 std::ostringstream OSS;
4393 OSS << "SU(" << SU->NodeNum << ")";
4394 return OSS.str();
4395 };
4396
4397 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4398 << " Order\n";
4399 for (SUnit *Dst : *Order)
4400 dbgs() << " " << DumpSU(Dst) << "\n";
4401}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:646
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:593
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:624
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
typename vector_type::const_iterator iterator
Definition SetVector.h:72
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.