LCOV - code coverage report
Current view: top level - lib/CodeGen - MachinePipeliner.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 1631 1707 95.5 %
Date: 2018-06-17 00:07:59 Functions: 96 97 99.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
      11             : //
      12             : // Software pipelining (SWP) is an instruction scheduling technique for loops
      13             : // that overlap loop iterations and exploits ILP via a compiler transformation.
      14             : //
      15             : // Swing Modulo Scheduling is an implementation of software pipelining
      16             : // that generates schedules that are near optimal in terms of initiation
      17             : // interval, register requirements, and stage count. See the papers:
      18             : //
      19             : // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
      20             : // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Processings of the 1996
      21             : // Conference on Parallel Architectures and Compilation Techiniques.
      22             : //
      23             : // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
      24             : // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
      25             : // Transactions on Computers, Vol. 50, No. 3, 2001.
      26             : //
      27             : // "An Implementation of Swing Modulo Scheduling With Extensions for
      28             : // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
      29             : // Urbana-Chambpain, 2005.
      30             : //
      31             : //
      32             : // The SMS algorithm consists of three main steps after computing the minimal
      33             : // initiation interval (MII).
      34             : // 1) Analyze the dependence graph and compute information about each
      35             : //    instruction in the graph.
      36             : // 2) Order the nodes (instructions) by priority based upon the heuristics
      37             : //    described in the algorithm.
      38             : // 3) Attempt to schedule the nodes in the specified order using the MII.
      39             : //
      40             : // This SMS implementation is a target-independent back-end pass. When enabled,
      41             : // the pass runs just prior to the register allocation pass, while the machine
      42             : // IR is in SSA form. If software pipelining is successful, then the original
      43             : // loop is replaced by the optimized loop. The optimized loop contains one or
      44             : // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
      45             : // the instructions cannot be scheduled in a given MII, we increase the MII by
      46             : // one and try again.
      47             : //
      48             : // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
      49             : // represent loop carried dependences in the DAG as order edges to the Phi
      50             : // nodes. We also perform several passes over the DAG to eliminate unnecessary
      51             : // edges that inhibit the ability to pipeline. The implementation uses the
      52             : // DFAPacketizer class to compute the minimum initiation interval and the check
      53             : // where an instruction may be inserted in the pipelined schedule.
      54             : //
      55             : // In order for the SMS pass to work, several target specific hooks need to be
      56             : // implemented to get information about the loop structure and to rewrite
      57             : // instructions.
      58             : //
      59             : //===----------------------------------------------------------------------===//
      60             : 
      61             : #include "llvm/ADT/ArrayRef.h"
      62             : #include "llvm/ADT/BitVector.h"
      63             : #include "llvm/ADT/DenseMap.h"
      64             : #include "llvm/ADT/MapVector.h"
      65             : #include "llvm/ADT/PriorityQueue.h"
      66             : #include "llvm/ADT/SetVector.h"
      67             : #include "llvm/ADT/SmallPtrSet.h"
      68             : #include "llvm/ADT/SmallSet.h"
      69             : #include "llvm/ADT/SmallVector.h"
      70             : #include "llvm/ADT/Statistic.h"
      71             : #include "llvm/ADT/iterator_range.h"
      72             : #include "llvm/Analysis/AliasAnalysis.h"
      73             : #include "llvm/Analysis/MemoryLocation.h"
      74             : #include "llvm/Analysis/ValueTracking.h"
      75             : #include "llvm/CodeGen/DFAPacketizer.h"
      76             : #include "llvm/CodeGen/LiveIntervals.h"
      77             : #include "llvm/CodeGen/MachineBasicBlock.h"
      78             : #include "llvm/CodeGen/MachineDominators.h"
      79             : #include "llvm/CodeGen/MachineFunction.h"
      80             : #include "llvm/CodeGen/MachineFunctionPass.h"
      81             : #include "llvm/CodeGen/MachineInstr.h"
      82             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      83             : #include "llvm/CodeGen/MachineLoopInfo.h"
      84             : #include "llvm/CodeGen/MachineMemOperand.h"
      85             : #include "llvm/CodeGen/MachineOperand.h"
      86             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      87             : #include "llvm/CodeGen/RegisterClassInfo.h"
      88             : #include "llvm/CodeGen/RegisterPressure.h"
      89             : #include "llvm/CodeGen/ScheduleDAG.h"
      90             : #include "llvm/CodeGen/ScheduleDAGInstrs.h"
      91             : #include "llvm/CodeGen/ScheduleDAGMutation.h"
      92             : #include "llvm/CodeGen/TargetInstrInfo.h"
      93             : #include "llvm/CodeGen/TargetOpcodes.h"
      94             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      95             : #include "llvm/CodeGen/TargetSubtargetInfo.h"
      96             : #include "llvm/Config/llvm-config.h"
      97             : #include "llvm/IR/Attributes.h"
      98             : #include "llvm/IR/DebugLoc.h"
      99             : #include "llvm/IR/Function.h"
     100             : #include "llvm/MC/LaneBitmask.h"
     101             : #include "llvm/MC/MCInstrDesc.h"
     102             : #include "llvm/MC/MCInstrItineraries.h"
     103             : #include "llvm/MC/MCRegisterInfo.h"
     104             : #include "llvm/Pass.h"
     105             : #include "llvm/Support/CommandLine.h"
     106             : #include "llvm/Support/Compiler.h"
     107             : #include "llvm/Support/Debug.h"
     108             : #include "llvm/Support/MathExtras.h"
     109             : #include "llvm/Support/raw_ostream.h"
     110             : #include <algorithm>
     111             : #include <cassert>
     112             : #include <climits>
     113             : #include <cstdint>
     114             : #include <deque>
     115             : #include <functional>
     116             : #include <iterator>
     117             : #include <map>
     118             : #include <memory>
     119             : #include <tuple>
     120             : #include <utility>
     121             : #include <vector>
     122             : 
     123             : using namespace llvm;
     124             : 
     125             : #define DEBUG_TYPE "pipeliner"
     126             : 
     127             : STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
     128             : STATISTIC(NumPipelined, "Number of loops software pipelined");
     129             : STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
     130             : 
     131             : /// A command line option to turn software pipelining on or off.
     132      303507 : static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
     133             :                                cl::ZeroOrMore,
     134      303507 :                                cl::desc("Enable Software Pipelining"));
     135             : 
     136             : /// A command line option to enable SWP at -Os.
     137      101169 : static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
     138      101169 :                                       cl::desc("Enable SWP at Os."), cl::Hidden,
     139      303507 :                                       cl::init(false));
     140             : 
     141             : /// A command line argument to limit minimum initial interval for pipelining.
     142      101169 : static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
     143      101169 :                               cl::desc("Size limit for the MII."),
     144      303507 :                               cl::Hidden, cl::init(27));
     145             : 
     146             : /// A command line argument to limit the number of stages in the pipeline.
     147             : static cl::opt<int>
     148      101169 :     SwpMaxStages("pipeliner-max-stages",
     149      101169 :                  cl::desc("Maximum stages allowed in the generated scheduled."),
     150      303507 :                  cl::Hidden, cl::init(3));
     151             : 
     152             : /// A command line option to disable the pruning of chain dependences due to
     153             : /// an unrelated Phi.
     154             : static cl::opt<bool>
     155      101169 :     SwpPruneDeps("pipeliner-prune-deps",
     156      101169 :                  cl::desc("Prune dependences between unrelated Phi nodes."),
     157      303507 :                  cl::Hidden, cl::init(true));
     158             : 
     159             : /// A command line option to disable the pruning of loop carried order
     160             : /// dependences.
     161             : static cl::opt<bool>
     162      101169 :     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
     163      101169 :                         cl::desc("Prune loop carried order dependences."),
     164      303507 :                         cl::Hidden, cl::init(true));
     165             : 
     166             : #ifndef NDEBUG
     167             : static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
     168             : #endif
     169             : 
     170      101169 : static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
     171      202338 :                                      cl::ReallyHidden, cl::init(false),
     172      303507 :                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
     173             : 
     174             : namespace {
     175             : 
     176             : class NodeSet;
     177             : class SMSchedule;
     178             : 
     179             : /// The main class in the implementation of the target independent
     180             : /// software pipeliner pass.
     181        2499 : class MachinePipeliner : public MachineFunctionPass {
     182             : public:
     183             :   MachineFunction *MF = nullptr;
     184             :   const MachineLoopInfo *MLI = nullptr;
     185             :   const MachineDominatorTree *MDT = nullptr;
     186             :   const InstrItineraryData *InstrItins;
     187             :   const TargetInstrInfo *TII = nullptr;
     188             :   RegisterClassInfo RegClassInfo;
     189             : 
     190             : #ifndef NDEBUG
     191             :   static int NumTries;
     192             : #endif
     193             : 
     194             :   /// Cache the target analysis information about the loop.
     195        2501 :   struct LoopInfo {
     196             :     MachineBasicBlock *TBB = nullptr;
     197             :     MachineBasicBlock *FBB = nullptr;
     198             :     SmallVector<MachineOperand, 4> BrCond;
     199             :     MachineInstr *LoopInductionVar = nullptr;
     200             :     MachineInstr *LoopCompare = nullptr;
     201             :   };
     202             :   LoopInfo LI;
     203             : 
     204             :   static char ID;
     205             : 
     206         834 :   MachinePipeliner() : MachineFunctionPass(ID) {
     207         834 :     initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
     208         834 :   }
     209             : 
     210             :   bool runOnMachineFunction(MachineFunction &MF) override;
     211             : 
     212         829 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     213             :     AU.addRequired<AAResultsWrapperPass>();
     214             :     AU.addPreserved<AAResultsWrapperPass>();
     215             :     AU.addRequired<MachineLoopInfo>();
     216             :     AU.addRequired<MachineDominatorTree>();
     217             :     AU.addRequired<LiveIntervals>();
     218         829 :     MachineFunctionPass::getAnalysisUsage(AU);
     219         829 :   }
     220             : 
     221             : private:
     222             :   void preprocessPhiNodes(MachineBasicBlock &B);
     223             :   bool canPipelineLoop(MachineLoop &L);
     224             :   bool scheduleLoop(MachineLoop &L);
     225             :   bool swingModuloScheduler(MachineLoop &L);
     226             : };
     227             : 
     228             : /// This class builds the dependence graph for the instructions in a loop,
     229             : /// and attempts to schedule the instructions using the SMS algorithm.
     230         579 : class SwingSchedulerDAG : public ScheduleDAGInstrs {
     231             :   MachinePipeliner &Pass;
     232             :   /// The minimum initiation interval between iterations for this schedule.
     233             :   unsigned MII = 0;
     234             :   /// Set to true if a valid pipelined schedule is found for the loop.
     235             :   bool Scheduled = false;
     236             :   MachineLoop &Loop;
     237             :   LiveIntervals &LIS;
     238             :   const RegisterClassInfo &RegClassInfo;
     239             : 
     240             :   /// A toplogical ordering of the SUnits, which is needed for changing
     241             :   /// dependences and iterating over the SUnits.
     242             :   ScheduleDAGTopologicalSort Topo;
     243             : 
     244             :   struct NodeInfo {
     245             :     int ASAP = 0;
     246             :     int ALAP = 0;
     247             :     int ZeroLatencyDepth = 0;
     248             :     int ZeroLatencyHeight = 0;
     249             : 
     250             :     NodeInfo() = default;
     251             :   };
     252             :   /// Computed properties for each node in the graph.
     253             :   std::vector<NodeInfo> ScheduleInfo;
     254             : 
     255             :   enum OrderKind { BottomUp = 0, TopDown = 1 };
     256             :   /// Computed node ordering for scheduling.
     257             :   SetVector<SUnit *> NodeOrder;
     258             : 
     259             :   using NodeSetType = SmallVector<NodeSet, 8>;
     260             :   using ValueMapTy = DenseMap<unsigned, unsigned>;
     261             :   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
     262             :   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
     263             : 
     264             :   /// Instructions to change when emitting the final schedule.
     265             :   DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
     266             : 
     267             :   /// We may create a new instruction, so remember it because it
     268             :   /// must be deleted when the pass is finished.
     269             :   SmallPtrSet<MachineInstr *, 4> NewMIs;
     270             : 
     271             :   /// Ordered list of DAG postprocessing steps.
     272             :   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
     273             : 
     274             :   /// Helper class to implement Johnson's circuit finding algorithm.
     275         386 :   class Circuits {
     276             :     std::vector<SUnit> &SUnits;
     277             :     SetVector<SUnit *> Stack;
     278             :     BitVector Blocked;
     279             :     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
     280             :     SmallVector<SmallVector<int, 4>, 16> AdjK;
     281             :     unsigned NumPaths;
     282             :     static unsigned MaxPaths;
     283             : 
     284             :   public:
     285         193 :     Circuits(std::vector<SUnit> &SUs)
     286        1351 :         : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
     287             : 
     288             :     /// Reset the data structures used in the circuit algorithm.
     289        2066 :     void reset() {
     290             :       Stack.clear();
     291             :       Blocked.reset();
     292        8264 :       B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
     293        2066 :       NumPaths = 0;
     294        2066 :     }
     295             : 
     296             :     void createAdjacencyStructure(SwingSchedulerDAG *DAG);
     297             :     bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
     298             :     void unblock(int U);
     299             :   };
     300             : 
     301             : public:
     302         193 :   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
     303             :                     const RegisterClassInfo &rci)
     304         386 :       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
     305         772 :         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
     306         193 :     P.MF->getSubtarget().getSMSMutations(Mutations);
     307         193 :   }
     308             : 
     309             :   void schedule() override;
     310             :   void finishBlock() override;
     311             : 
     312             :   /// Return true if the loop kernel has been scheduled.
     313             :   bool hasNewSchedule() { return Scheduled; }
     314             : 
     315             :   /// Return the earliest time an instruction may be scheduled.
     316       10286 :   int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
     317             : 
     318             :   /// Return the latest time an instruction my be scheduled.
     319        9074 :   int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
     320             : 
     321             :   /// The mobility function, which the number of slots in which
     322             :   /// an instruction may be scheduled.
     323        4532 :   int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
     324             : 
     325             :   /// The depth, in the dependence graph, for a node.
     326             :   unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
     327             : 
     328             :   /// The maximum unweighted length of a path from an arbitrary node to the
     329             :   /// given node in which each edge has latency 0
     330             :   int getZeroLatencyDepth(SUnit *Node) {
     331        8328 :     return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
     332             :   }
     333             : 
     334             :   /// The height, in the dependence graph, for a node.
     335             :   unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
     336             : 
     337             :   /// The maximum unweighted length of a path from the given node to an
     338             :   /// arbitrary node in which each edge has latency 0
     339             :   int getZeroLatencyHeight(SUnit *Node) {
     340        6700 :     return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
     341             :   }
     342             : 
     343             :   /// Return true if the dependence is a back-edge in the data dependence graph.
     344             :   /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
     345             :   /// using an anti dependence from a Phi to an instruction.
     346       86537 :   bool isBackedge(SUnit *Source, const SDep &Dep) {
     347       86537 :     if (Dep.getKind() != SDep::Anti)
     348             :       return false;
     349       15125 :     return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
     350             :   }
     351             : 
     352             :   bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
     353             : 
     354             :   /// The distance function, which indicates that operation V of iteration I
     355             :   /// depends on operations U of iteration I-distance.
     356             :   unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
     357             :     // Instructions that feed a Phi have a distance of 1. Computing larger
     358             :     // values for arrays requires data dependence information.
     359        4735 :     if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
     360             :       return 1;
     361             :     return 0;
     362             :   }
     363             : 
     364             :   /// Set the Minimum Initiation Interval for this schedule attempt.
     365             :   void setMII(unsigned mii) { MII = mii; }
     366             : 
     367             :   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
     368             : 
     369             :   void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
     370             : 
     371             :   /// Return the new base register that was stored away for the changed
     372             :   /// instruction.
     373             :   unsigned getInstrBaseReg(SUnit *SU) {
     374             :     DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
     375         461 :         InstrChanges.find(SU);
     376         461 :     if (It != InstrChanges.end())
     377          56 :       return It->second.first;
     378             :     return 0;
     379             :   }
     380             : 
     381             :   void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
     382             :     Mutations.push_back(std::move(Mutation));
     383             :   }
     384             : 
     385             : private:
     386             :   void addLoopCarriedDependences(AliasAnalysis *AA);
     387             :   void updatePhiDependences();
     388             :   void changeDependences();
     389             :   unsigned calculateResMII();
     390             :   unsigned calculateRecMII(NodeSetType &RecNodeSets);
     391             :   void findCircuits(NodeSetType &NodeSets);
     392             :   void fuseRecs(NodeSetType &NodeSets);
     393             :   void removeDuplicateNodes(NodeSetType &NodeSets);
     394             :   void computeNodeFunctions(NodeSetType &NodeSets);
     395             :   void registerPressureFilter(NodeSetType &NodeSets);
     396             :   void colocateNodeSets(NodeSetType &NodeSets);
     397             :   void checkNodeSets(NodeSetType &NodeSets);
     398             :   void groupRemainingNodes(NodeSetType &NodeSets);
     399             :   void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
     400             :                          SetVector<SUnit *> &NodesAdded);
     401             :   void computeNodeOrder(NodeSetType &NodeSets);
     402             :   void checkValidNodeOrder(const NodeSetType &Circuits) const;
     403             :   bool schedulePipeline(SMSchedule &Schedule);
     404             :   void generatePipelinedLoop(SMSchedule &Schedule);
     405             :   void generateProlog(SMSchedule &Schedule, unsigned LastStage,
     406             :                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
     407             :                       MBBVectorTy &PrologBBs);
     408             :   void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
     409             :                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
     410             :                       MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
     411             :   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
     412             :                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
     413             :                             SMSchedule &Schedule, ValueMapTy *VRMap,
     414             :                             InstrMapTy &InstrMap, unsigned LastStageNum,
     415             :                             unsigned CurStageNum, bool IsLast);
     416             :   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
     417             :                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
     418             :                     SMSchedule &Schedule, ValueMapTy *VRMap,
     419             :                     InstrMapTy &InstrMap, unsigned LastStageNum,
     420             :                     unsigned CurStageNum, bool IsLast);
     421             :   void removeDeadInstructions(MachineBasicBlock *KernelBB,
     422             :                               MBBVectorTy &EpilogBBs);
     423             :   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
     424             :                       SMSchedule &Schedule);
     425             :   void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
     426             :                    MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
     427             :                    ValueMapTy *VRMap);
     428             :   bool computeDelta(MachineInstr &MI, unsigned &Delta);
     429             :   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
     430             :                          unsigned Num);
     431             :   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
     432             :                            unsigned InstStageNum);
     433             :   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
     434             :                                     unsigned InstStageNum,
     435             :                                     SMSchedule &Schedule);
     436             :   void updateInstruction(MachineInstr *NewMI, bool LastDef,
     437             :                          unsigned CurStageNum, unsigned InstStageNum,
     438             :                          SMSchedule &Schedule, ValueMapTy *VRMap);
     439             :   MachineInstr *findDefInLoop(unsigned Reg);
     440             :   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
     441             :                          unsigned LoopStage, ValueMapTy *VRMap,
     442             :                          MachineBasicBlock *BB);
     443             :   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
     444             :                         SMSchedule &Schedule, ValueMapTy *VRMap,
     445             :                         InstrMapTy &InstrMap);
     446             :   void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
     447             :                              InstrMapTy &InstrMap, unsigned CurStageNum,
     448             :                              unsigned PhiNum, MachineInstr *Phi,
     449             :                              unsigned OldReg, unsigned NewReg,
     450             :                              unsigned PrevReg = 0);
     451             :   bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
     452             :                              unsigned &OffsetPos, unsigned &NewBase,
     453             :                              int64_t &NewOffset);
     454             :   void postprocessDAG();
     455             : };
     456             : 
     457             : /// A NodeSet contains a set of SUnit DAG nodes with additional information
     458             : /// that assigns a priority to the set.
     459        9177 : class NodeSet {
     460             :   SetVector<SUnit *> Nodes;
     461             :   bool HasRecurrence = false;
     462             :   unsigned RecMII = 0;
     463             :   int MaxMOV = 0;
     464             :   unsigned MaxDepth = 0;
     465             :   unsigned Colocate = 0;
     466             :   SUnit *ExceedPressure = nullptr;
     467             :   unsigned Latency = 0;
     468             : 
     469             : public:
     470             :   using iterator = SetVector<SUnit *>::const_iterator;
     471             : 
     472         191 :   NodeSet() = default;
     473         542 :   NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
     474             :     Latency = 0;
     475        2306 :     for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
     476       11636 :       for (const SDep &Succ : Nodes[i]->Succs)
     477        3172 :         if (Nodes.count(Succ.getSUnit()))
     478        1938 :           Latency += Succ.getLatency();
     479         542 :   }
     480             : 
     481        1259 :   bool insert(SUnit *SU) { return Nodes.insert(SU); }
     482             : 
     483          81 :   void insert(iterator S, iterator E) { Nodes.insert(S, E); }
     484             : 
     485             :   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
     486         733 :     return Nodes.remove_if(P);
     487             :   }
     488             : 
     489             :   unsigned count(SUnit *SU) const { return Nodes.count(SU); }
     490             : 
     491             :   bool hasRecurrence() { return HasRecurrence; };
     492             : 
     493         416 :   unsigned size() const { return Nodes.size(); }
     494             : 
     495             :   bool empty() const { return Nodes.empty(); }
     496             : 
     497        2641 :   SUnit *getNode(unsigned i) const { return Nodes[i]; };
     498             : 
     499         563 :   void setRecMII(unsigned mii) { RecMII = mii; };
     500             : 
     501          26 :   void setColocate(unsigned c) { Colocate = c; };
     502             : 
     503           2 :   void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
     504             : 
     505             :   bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
     506             : 
     507         121 :   int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
     508             : 
     509           1 :   int getRecMII() { return RecMII; }
     510             : 
     511             :   /// Summarize node functions for the entire node set.
     512         416 :   void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
     513        1556 :     for (SUnit *SU : *this) {
     514        2280 :       MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
     515        2280 :       MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
     516             :     }
     517         416 :   }
     518             : 
     519             :   unsigned getLatency() { return Latency; }
     520             : 
     521             :   unsigned getMaxDepth() { return MaxDepth; }
     522             : 
     523             :   void clear() {
     524             :     Nodes.clear();
     525         201 :     RecMII = 0;
     526         201 :     HasRecurrence = false;
     527         201 :     MaxMOV = 0;
     528         201 :     MaxDepth = 0;
     529         201 :     Colocate = 0;
     530         201 :     ExceedPressure = nullptr;
     531             :   }
     532             : 
     533        1102 :   operator SetVector<SUnit *> &() { return Nodes; }
     534             : 
     535             :   /// Sort the node sets by importance. First, rank them by recurrence MII,
     536             :   /// then by mobility (least mobile done first), and finally by depth.
     537             :   /// Each node set may contain a colocate value which is used as the first
     538             :   /// tie breaker, if it's set.
     539             :   bool operator>(const NodeSet &RHS) const {
     540         467 :     if (RecMII == RHS.RecMII) {
     541         325 :       if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
     542          13 :         return Colocate < RHS.Colocate;
     543         312 :       if (MaxMOV == RHS.MaxMOV)
     544         149 :         return MaxDepth > RHS.MaxDepth;
     545         163 :       return MaxMOV < RHS.MaxMOV;
     546             :     }
     547         142 :     return RecMII > RHS.RecMII;
     548             :   }
     549             : 
     550             :   bool operator==(const NodeSet &RHS) const {
     551             :     return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
     552             :            MaxDepth == RHS.MaxDepth;
     553             :   }
     554             : 
     555             :   bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
     556             : 
     557             :   iterator begin() { return Nodes.begin(); }
     558             :   iterator end() { return Nodes.end(); }
     559             : 
     560             :   void print(raw_ostream &os) const {
     561             :     os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     562             :        << " depth " << MaxDepth << " col " << Colocate << "\n";
     563             :     for (const auto &I : Nodes)
     564             :       os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
     565             :     os << "\n";
     566             :   }
     567             : 
     568             : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     569             :   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
     570             : #endif
     571             : };
     572             : 
     573             : /// This class repesents the scheduled code.  The main data structure is a
     574             : /// map from scheduled cycle to instructions.  During scheduling, the
     575             : /// data structure explicitly represents all stages/iterations.   When
     576             : /// the algorithm finshes, the schedule is collapsed into a single stage,
     577             : /// which represents instructions from different loop iterations.
     578             : ///
     579             : /// The SMS algorithm allows negative values for cycles, so the first cycle
     580             : /// in the schedule is the smallest cycle value.
     581         573 : class SMSchedule {
     582             : private:
     583             :   /// Map from execution cycle to instructions.
     584             :   DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
     585             : 
     586             :   /// Map from instruction to execution cycle.
     587             :   std::map<SUnit *, int> InstrToCycle;
     588             : 
     589             :   /// Map for each register and the max difference between its uses and def.
     590             :   /// The first element in the pair is the max difference in stages. The
     591             :   /// second is true if the register defines a Phi value and loop value is
     592             :   /// scheduled before the Phi.
     593             :   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
     594             : 
     595             :   /// Keep track of the first cycle value in the schedule.  It starts
     596             :   /// as zero, but the algorithm allows negative values.
     597             :   int FirstCycle = 0;
     598             : 
     599             :   /// Keep track of the last cycle value in the schedule.
     600             :   int LastCycle = 0;
     601             : 
     602             :   /// The initiation interval (II) for the schedule.
     603             :   int InitiationInterval = 0;
     604             : 
     605             :   /// Target machine information.
     606             :   const TargetSubtargetInfo &ST;
     607             : 
     608             :   /// Virtual register information.
     609             :   MachineRegisterInfo &MRI;
     610             : 
     611             :   std::unique_ptr<DFAPacketizer> Resources;
     612             : 
     613             : public:
     614         191 :   SMSchedule(MachineFunction *mf)
     615         573 :       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
     616         955 :         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
     617             : 
     618         279 :   void reset() {
     619         279 :     ScheduledInstrs.clear();
     620             :     InstrToCycle.clear();
     621             :     RegToStageDiff.clear();
     622         279 :     FirstCycle = 0;
     623         279 :     LastCycle = 0;
     624         279 :     InitiationInterval = 0;
     625         279 :   }
     626             : 
     627             :   /// Set the initiation interval for this schedule.
     628         274 :   void setInitiationInterval(int ii) { InitiationInterval = ii; }
     629             : 
     630             :   /// Return the first cycle in the completed schedule.  This
     631             :   /// can be a negative value.
     632             :   int getFirstCycle() const { return FirstCycle; }
     633             : 
     634             :   /// Return the last cycle in the finalized schedule.
     635        1181 :   int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
     636             : 
     637             :   /// Return the cycle of the earliest scheduled instruction in the dependence
     638             :   /// chain.
     639             :   int earliestCycleInChain(const SDep &Dep);
     640             : 
     641             :   /// Return the cycle of the latest scheduled instruction in the dependence
     642             :   /// chain.
     643             :   int latestCycleInChain(const SDep &Dep);
     644             : 
     645             :   void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
     646             :                     int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
     647             :   bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
     648             : 
     649             :   /// Iterators for the cycle to instruction map.
     650             :   using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
     651             :   using const_sched_iterator =
     652             :       DenseMap<int, std::deque<SUnit *>>::const_iterator;
     653             : 
     654             :   /// Return true if the instruction is scheduled at the specified stage.
     655             :   bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
     656        5134 :     return (stageScheduled(SU) == (int)StageNum);
     657             :   }
     658             : 
     659             :   /// Return the stage for a scheduled instruction.  Return -1 if
     660             :   /// the instruction has not been scheduled.
     661       31281 :   int stageScheduled(SUnit *SU) const {
     662             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
     663       31281 :     if (it == InstrToCycle.end())
     664             :       return -1;
     665       29672 :     return (it->second - FirstCycle) / InitiationInterval;
     666             :   }
     667             : 
     668             :   /// Return the cycle for a scheduled instruction. This function normalizes
     669             :   /// the first cycle to be 0.
     670             :   unsigned cycleScheduled(SUnit *SU) const {
     671             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
     672             :     assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
     673        6308 :     return (it->second - FirstCycle) % InitiationInterval;
     674             :   }
     675             : 
     676             :   /// Return the maximum stage count needed for this schedule.
     677             :   unsigned getMaxStageCount() {
     678        9491 :     return (LastCycle - FirstCycle) / InitiationInterval;
     679             :   }
     680             : 
     681             :   /// Return the max. number of stages/iterations that can occur between a
     682             :   /// register definition and its uses.
     683        3269 :   unsigned getStagesForReg(int Reg, unsigned CurStage) {
     684        3269 :     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
     685        6538 :     if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
     686             :       return 1;
     687        3265 :     return Stages.first;
     688             :   }
     689             : 
     690             :   /// The number of stages for a Phi is a little different than other
     691             :   /// instructions. The minimum value computed in RegToStageDiff is 1
     692             :   /// because we assume the Phi is needed for at least 1 iteration.
     693             :   /// This is not the case if the loop value is scheduled prior to the
     694             :   /// Phi in the same stage.  This function returns the number of stages
     695             :   /// or iterations needed between the Phi definition and any uses.
     696             :   unsigned getStagesForPhi(int Reg) {
     697         464 :     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
     698         464 :     if (Stages.second)
     699             :       return Stages.first;
     700         433 :     return Stages.first - 1;
     701             :   }
     702             : 
     703             :   /// Return the instructions that are scheduled at the specified cycle.
     704             :   std::deque<SUnit *> &getInstructions(int cycle) {
     705       23912 :     return ScheduledInstrs[cycle];
     706             :   }
     707             : 
     708             :   bool isValidSchedule(SwingSchedulerDAG *SSD);
     709             :   void finalizeSchedule(SwingSchedulerDAG *SSD);
     710             :   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
     711             :                        std::deque<SUnit *> &Insts);
     712             :   bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
     713             :   bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
     714             :                              MachineOperand &MO);
     715             :   void print(raw_ostream &os) const;
     716             :   void dump() const;
     717             : };
     718             : 
     719             : } // end anonymous namespace
     720             : 
     721             : unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
     722             : char MachinePipeliner::ID = 0;
     723             : #ifndef NDEBUG
     724             : int MachinePipeliner::NumTries = 0;
     725             : #endif
     726             : char &llvm::MachinePipelinerID = MachinePipeliner::ID;
     727             : 
     728       26770 : INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
     729             :                       "Modulo Software Pipelining", false, false)
     730       26770 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     731       26770 : INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
     732       26770 : INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
     733       26770 : INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
     734      149780 : INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
     735             :                     "Modulo Software Pipelining", false, false)
     736             : 
     737             : /// The "main" function for implementing Swing Modulo Scheduling.
     738        3289 : bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
     739        3289 :   if (skipFunction(mf.getFunction()))
     740             :     return false;
     741             : 
     742        3279 :   if (!EnableSWP)
     743             :     return false;
     744             : 
     745        3300 :   if (mf.getFunction().getAttributes().hasAttribute(
     746        3302 :           AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
     747          32 :       !EnableSWPOptSize.getPosition())
     748             :     return false;
     749             : 
     750        3240 :   MF = &mf;
     751        3240 :   MLI = &getAnalysis<MachineLoopInfo>();
     752        3240 :   MDT = &getAnalysis<MachineDominatorTree>();
     753        3240 :   TII = MF->getSubtarget().getInstrInfo();
     754        3240 :   RegClassInfo.runOnMachineFunction(*MF);
     755             : 
     756        6845 :   for (auto &L : *MLI)
     757         365 :     scheduleLoop(*L);
     758             : 
     759             :   return false;
     760             : }
     761             : 
     762             : /// Attempt to perform the SMS algorithm on the specified loop. This function is
     763             : /// the main entry point for the algorithm.  The function identifies candidate
     764             : /// loops, calculates the minimum initiation interval, and attempts to schedule
     765             : /// the loop.
     766         405 : bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
     767             :   bool Changed = false;
     768         445 :   for (auto &InnerLoop : L)
     769          40 :     Changed |= scheduleLoop(*InnerLoop);
     770             : 
     771             : #ifndef NDEBUG
     772             :   // Stop trying after reaching the limit (if any).
     773             :   int Limit = SwpLoopLimit;
     774             :   if (Limit >= 0) {
     775             :     if (NumTries >= SwpLoopLimit)
     776             :       return Changed;
     777             :     NumTries++;
     778             :   }
     779             : #endif
     780             : 
     781         405 :   if (!canPipelineLoop(L))
     782             :     return Changed;
     783             : 
     784             :   ++NumTrytoPipeline;
     785             : 
     786         193 :   Changed = swingModuloScheduler(L);
     787             : 
     788         193 :   return Changed;
     789             : }
     790             : 
     791             : /// Return true if the loop can be software pipelined.  The algorithm is
     792             : /// restricted to loops with a single basic block.  Make sure that the
     793             : /// branch in the loop can be analyzed.
     794         405 : bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
     795         405 :   if (L.getNumBlocks() != 1)
     796             :     return false;
     797             : 
     798             :   // Check if the branch can't be understood because we can't do pipelining
     799             :   // if that's the case.
     800         337 :   LI.TBB = nullptr;
     801         337 :   LI.FBB = nullptr;
     802         337 :   LI.BrCond.clear();
     803         674 :   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
     804             :     return false;
     805             : 
     806         337 :   LI.LoopInductionVar = nullptr;
     807         337 :   LI.LoopCompare = nullptr;
     808         337 :   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
     809             :     return false;
     810             : 
     811         193 :   if (!L.getLoopPreheader())
     812             :     return false;
     813             : 
     814             :   // Remove any subregisters from inputs to phi nodes.
     815         193 :   preprocessPhiNodes(*L.getHeader());
     816         193 :   return true;
     817             : }
     818             : 
     819         193 : void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
     820         193 :   MachineRegisterInfo &MRI = MF->getRegInfo();
     821         193 :   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
     822             : 
     823        1010 :   for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
     824         431 :     MachineOperand &DefOp = PI.getOperand(0);
     825             :     assert(DefOp.getSubReg() == 0);
     826         431 :     auto *RC = MRI.getRegClass(DefOp.getReg());
     827             : 
     828        1293 :     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
     829         862 :       MachineOperand &RegOp = PI.getOperand(i);
     830         862 :       if (RegOp.getSubReg() == 0)
     831         851 :         continue;
     832             : 
     833             :       // If the operand uses a subregister, replace it with a new register
     834             :       // without subregisters, and generate a copy to the new register.
     835          11 :       unsigned NewReg = MRI.createVirtualRegister(RC);
     836          22 :       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
     837          11 :       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
     838             :       const DebugLoc &DL = PredB.findDebugLoc(At);
     839          33 :       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
     840             :                     .addReg(RegOp.getReg(), getRegState(RegOp),
     841          11 :                             RegOp.getSubReg());
     842          11 :       Slots.insertMachineInstrInMaps(*Copy);
     843          11 :       RegOp.setReg(NewReg);
     844             :       RegOp.setSubReg(0);
     845             :     }
     846             :   }
     847         193 : }
     848             : 
     849             : /// The SMS algorithm consists of the following main steps:
     850             : /// 1. Computation and analysis of the dependence graph.
     851             : /// 2. Ordering of the nodes (instructions).
     852             : /// 3. Attempt to Schedule the loop.
     853         193 : bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
     854             :   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
     855             : 
     856         386 :   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
     857             : 
     858             :   MachineBasicBlock *MBB = L.getHeader();
     859             :   // The kernel should not include any terminator instructions.  These
     860             :   // will be added back later.
     861         193 :   SMS.startBlock(MBB);
     862             : 
     863             :   // Compute the number of 'real' instructions in the basic block by
     864             :   // ignoring terminators.
     865             :   unsigned size = MBB->size();
     866         579 :   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
     867             :                                    E = MBB->instr_end();
     868         579 :        I != E; ++I, --size)
     869             :     ;
     870             : 
     871         386 :   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
     872         193 :   SMS.schedule();
     873         193 :   SMS.exitRegion();
     874             : 
     875         193 :   SMS.finishBlock();
     876         386 :   return SMS.hasNewSchedule();
     877             : }
     878             : 
     879             : /// We override the schedule function in ScheduleDAGInstrs to implement the
     880             : /// scheduling part of the Swing Modulo Scheduling algorithm.
     881         193 : void SwingSchedulerDAG::schedule() {
     882         193 :   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
     883         193 :   buildSchedGraph(AA);
     884         193 :   addLoopCarriedDependences(AA);
     885         193 :   updatePhiDependences();
     886         193 :   Topo.InitDAGTopologicalSorting();
     887             :   postprocessDAG();
     888         193 :   changeDependences();
     889             :   LLVM_DEBUG({
     890             :     for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
     891             :       SUnits[su].dumpAll(this);
     892             :   });
     893             : 
     894         130 :   NodeSetType NodeSets;
     895         193 :   findCircuits(NodeSets);
     896         130 :   NodeSetType Circuits = NodeSets;
     897             : 
     898             :   // Calculate the MII.
     899         193 :   unsigned ResMII = calculateResMII();
     900         193 :   unsigned RecMII = calculateRecMII(NodeSets);
     901             : 
     902         193 :   fuseRecs(NodeSets);
     903             : 
     904             :   // This flag is used for testing and can cause correctness problems.
     905         193 :   if (SwpIgnoreRecMII)
     906           1 :     RecMII = 0;
     907             : 
     908         193 :   MII = std::max(ResMII, RecMII);
     909             :   LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII
     910             :                     << ", res=" << ResMII << ")\n");
     911             : 
     912             :   // Can't schedule a loop without a valid MII.
     913         193 :   if (MII == 0)
     914          63 :     return;
     915             : 
     916             :   // Don't pipeline large loops.
     917         193 :   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
     918             :     return;
     919             : 
     920         191 :   computeNodeFunctions(NodeSets);
     921             : 
     922         191 :   registerPressureFilter(NodeSets);
     923             : 
     924         191 :   colocateNodeSets(NodeSets);
     925             : 
     926         191 :   checkNodeSets(NodeSets);
     927             : 
     928             :   LLVM_DEBUG({
     929             :     for (auto &I : NodeSets) {
     930             :       dbgs() << "  Rec NodeSet ";
     931             :       I.dump();
     932             :     }
     933             :   });
     934             : 
     935             :   std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
     936             : 
     937         191 :   groupRemainingNodes(NodeSets);
     938             : 
     939         191 :   removeDuplicateNodes(NodeSets);
     940             : 
     941             :   LLVM_DEBUG({
     942             :     for (auto &I : NodeSets) {
     943             :       dbgs() << "  NodeSet ";
     944             :       I.dump();
     945             :     }
     946             :   });
     947             : 
     948         191 :   computeNodeOrder(NodeSets);
     949             : 
     950             :   // check for node order issues
     951         191 :   checkValidNodeOrder(Circuits);
     952             : 
     953         321 :   SMSchedule Schedule(Pass.MF);
     954         191 :   Scheduled = schedulePipeline(Schedule);
     955             : 
     956         191 :   if (!Scheduled)
     957          61 :     return;
     958             : 
     959         130 :   unsigned numStages = Schedule.getMaxStageCount();
     960             :   // No need to generate pipeline if there are no overlapped iterations.
     961         130 :   if (numStages == 0)
     962             :     return;
     963             : 
     964             :   // Check that the maximum stage count is less than user-defined limit.
     965         130 :   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
     966             :     return;
     967             : 
     968         130 :   generatePipelinedLoop(Schedule);
     969             :   ++NumPipelined;
     970             : }
     971             : 
     972             : /// Clean up after the software pipeliner runs.
     973         193 : void SwingSchedulerDAG::finishBlock() {
     974         193 :   for (MachineInstr *I : NewMIs)
     975          22 :     MF.DeleteMachineInstr(I);
     976         193 :   NewMIs.clear();
     977             : 
     978             :   // Call the superclass.
     979         193 :   ScheduleDAGInstrs::finishBlock();
     980         193 : }
     981             : 
     982             : /// Return the register values for  the operands of a Phi instruction.
     983             : /// This function assume the instruction is a Phi.
     984             : static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
     985             :                        unsigned &InitVal, unsigned &LoopVal) {
     986             :   assert(Phi.isPHI() && "Expecting a Phi.");
     987             : 
     988             :   InitVal = 0;
     989             :   LoopVal = 0;
     990        8475 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
     991       11300 :     if (Phi.getOperand(i + 1).getMBB() != Loop)
     992        1120 :       InitVal = Phi.getOperand(i).getReg();
     993             :     else
     994        2825 :       LoopVal = Phi.getOperand(i).getReg();
     995             : 
     996             :   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
     997             : }
     998             : 
     999             : /// Return the Phi register value that comes from the incoming block.
    1000             : static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
    1001         348 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    1002         696 :     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
    1003         348 :       return Phi.getOperand(i).getReg();
    1004             :   return 0;
    1005             : }
    1006             : 
    1007             : /// Return the Phi register value that comes the loop block.
    1008             : static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
    1009        3571 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    1010        6488 :     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
    1011        1543 :       return Phi.getOperand(i).getReg();
    1012             :   return 0;
    1013             : }
    1014             : 
    1015             : /// Return true if SUb can be reached from SUa following the chain edges.
    1016         368 : static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
    1017             :   SmallPtrSet<SUnit *, 8> Visited;
    1018             :   SmallVector<SUnit *, 8> Worklist;
    1019         368 :   Worklist.push_back(SUa);
    1020         561 :   while (!Worklist.empty()) {
    1021             :     const SUnit *SU = Worklist.pop_back_val();
    1022        1776 :     for (auto &SI : SU->Succs) {
    1023         920 :       SUnit *SuccSU = SI.getSUnit();
    1024         920 :       if (SI.getKind() == SDep::Order) {
    1025         515 :         if (Visited.count(SuccSU))
    1026           0 :           continue;
    1027         515 :         if (SuccSU == SUb)
    1028         257 :           return true;
    1029         258 :         Worklist.push_back(SuccSU);
    1030         258 :         Visited.insert(SuccSU);
    1031             :       }
    1032             :     }
    1033             :   }
    1034             :   return false;
    1035             : }
    1036             : 
    1037             : /// Return true if the instruction causes a chain between memory
    1038             : /// references before and after it.
    1039        2066 : static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
    1040        4130 :   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
    1041        2073 :          (MI.hasOrderedMemoryRef() &&
    1042          18 :           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
    1043             : }
    1044             : 
    1045             : /// Return the underlying objects for the memory references of an instruction.
    1046             : /// This function calls the code in ValueTracking, but first checks that the
    1047             : /// instruction has a memory operand.
    1048         504 : static void getUnderlyingObjects(MachineInstr *MI,
    1049             :                                  SmallVectorImpl<Value *> &Objs,
    1050             :                                  const DataLayout &DL) {
    1051         504 :   if (!MI->hasOneMemOperand())
    1052             :     return;
    1053         468 :   MachineMemOperand *MM = *MI->memoperands_begin();
    1054         460 :   if (!MM->getValue())
    1055             :     return;
    1056         460 :   GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
    1057         768 :   for (Value *V : Objs) {
    1058         460 :     if (!isIdentifiedObject(V)) {
    1059             :       Objs.clear();
    1060         306 :       return;
    1061             :     }
    1062         154 :     Objs.push_back(V);
    1063             :   }
    1064             : }
    1065             : 
    1066             : /// Add a chain edge between a load and store if the store can be an
    1067             : /// alias of the load on a subsequent iteration, i.e., a loop carried
    1068             : /// dependence. This code is very similar to the code in ScheduleDAGInstrs
    1069             : /// but that code doesn't create loop carried dependences.
    1070         193 : void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
    1071             :   MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
    1072             :   Value *UnknownValue =
    1073         193 :     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
    1074        2259 :   for (auto &SU : SUnits) {
    1075        2066 :     MachineInstr &MI = *SU.getInstr();
    1076        2066 :     if (isDependenceBarrier(MI, AA))
    1077             :       PendingLoads.clear();
    1078        2055 :     else if (MI.mayLoad()) {
    1079             :       SmallVector<Value *, 4> Objs;
    1080         334 :       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
    1081         334 :       if (Objs.empty())
    1082         234 :         Objs.push_back(UnknownValue);
    1083        1202 :       for (auto V : Objs) {
    1084         434 :         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
    1085         434 :         SUs.push_back(&SU);
    1086             :       }
    1087        1721 :     } else if (MI.mayStore()) {
    1088             :       SmallVector<Value *, 4> Objs;
    1089         170 :       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
    1090         170 :       if (Objs.empty())
    1091         116 :         Objs.push_back(UnknownValue);
    1092         618 :       for (auto V : Objs) {
    1093             :         MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
    1094         224 :             PendingLoads.find(V);
    1095         224 :         if (I == PendingLoads.end())
    1096             :           continue;
    1097         849 :         for (auto Load : I->second) {
    1098         368 :           if (isSuccOrder(Load, &SU))
    1099         538 :             continue;
    1100         111 :           MachineInstr &LdMI = *Load->getInstr();
    1101             :           // First, perform the cheaper check that compares the base register.
    1102             :           // If they are the same and the load offset is less than the store
    1103             :           // offset, then mark the dependence as loop carried potentially.
    1104             :           unsigned BaseReg1, BaseReg2;
    1105             :           int64_t Offset1, Offset2;
    1106         219 :           if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
    1107         108 :               TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
    1108         105 :             if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
    1109             :               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
    1110             :                      "What happened to the chain edge?");
    1111             :               SDep Dep(Load, SDep::Barrier);
    1112             :               Dep.setLatency(1);
    1113          24 :               SU.addPred(Dep);
    1114          24 :               continue;
    1115             :             }
    1116             :           }
    1117             :           // Second, the more expensive check that uses alias analysis on the
    1118             :           // base registers. If they alias, and the load offset is less than
    1119             :           // the store offset, the mark the dependence as loop carried.
    1120          87 :           if (!AA) {
    1121             :             SDep Dep(Load, SDep::Barrier);
    1122             :             Dep.setLatency(1);
    1123           0 :             SU.addPred(Dep);
    1124           0 :             continue;
    1125             :           }
    1126          87 :           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
    1127          87 :           MachineMemOperand *MMO2 = *MI.memoperands_begin();
    1128         174 :           if (!MMO1->getValue() || !MMO2->getValue()) {
    1129             :             SDep Dep(Load, SDep::Barrier);
    1130             :             Dep.setLatency(1);
    1131           0 :             SU.addPred(Dep);
    1132           0 :             continue;
    1133             :           }
    1134         106 :           if (MMO1->getValue() == MMO2->getValue() &&
    1135          19 :               MMO1->getOffset() <= MMO2->getOffset()) {
    1136             :             SDep Dep(Load, SDep::Barrier);
    1137             :             Dep.setLatency(1);
    1138           0 :             SU.addPred(Dep);
    1139           0 :             continue;
    1140             :           }
    1141             :           AliasResult AAResult = AA->alias(
    1142          87 :               MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
    1143             :                              MMO1->getAAInfo()),
    1144          87 :               MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
    1145          87 :                              MMO2->getAAInfo()));
    1146             : 
    1147          87 :           if (AAResult != NoAlias) {
    1148             :             SDep Dep(Load, SDep::Barrier);
    1149             :             Dep.setLatency(1);
    1150          19 :             SU.addPred(Dep);
    1151             :           }
    1152             :         }
    1153             :       }
    1154             :     }
    1155             :   }
    1156         193 : }
    1157             : 
    1158             : /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
    1159             : /// processes dependences for PHIs. This function adds true dependences
    1160             : /// from a PHI to a use, and a loop carried dependence from the use to the
    1161             : /// PHI. The loop carried dependence is represented as an anti dependence
    1162             : /// edge. This function also removes chain dependences between unrelated
    1163             : /// PHIs.
    1164         193 : void SwingSchedulerDAG::updatePhiDependences() {
    1165             :   SmallVector<SDep, 4> RemoveDeps;
    1166         193 :   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
    1167             : 
    1168             :   // Iterate over each DAG node.
    1169        2259 :   for (SUnit &I : SUnits) {
    1170             :     RemoveDeps.clear();
    1171             :     // Set to true if the instruction has an operand defined by a Phi.
    1172             :     unsigned HasPhiUse = 0;
    1173             :     unsigned HasPhiDef = 0;
    1174        2066 :     MachineInstr *MI = I.getInstr();
    1175             :     // Iterate over each operand, and we process the definitions.
    1176        9882 :     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
    1177        2066 :                                     MOE = MI->operands_end();
    1178        9882 :          MOI != MOE; ++MOI) {
    1179        7816 :       if (!MOI->isReg())
    1180        1975 :         continue;
    1181        5841 :       unsigned Reg = MOI->getReg();
    1182        5841 :       if (MOI->isDef()) {
    1183             :         // If the register is used by a Phi, then create an anti dependence.
    1184        2817 :         for (MachineRegisterInfo::use_instr_iterator
    1185        2085 :                  UI = MRI.use_instr_begin(Reg),
    1186             :                  UE = MRI.use_instr_end();
    1187        4902 :              UI != UE; ++UI) {
    1188             :           MachineInstr *UseMI = &*UI;
    1189             :           SUnit *SU = getSUnit(UseMI);
    1190        2746 :           if (SU != nullptr && UseMI->isPHI()) {
    1191             :             if (!MI->isPHI()) {
    1192             :               SDep Dep(SU, SDep::Anti, Reg);
    1193             :               Dep.setLatency(1);
    1194         400 :               I.addPred(Dep);
    1195             :             } else {
    1196             :               HasPhiDef = Reg;
    1197             :               // Add a chain edge to a dependent Phi that isn't an existing
    1198             :               // predecessor.
    1199          40 :               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
    1200          14 :                 I.addPred(SDep(SU, SDep::Barrier));
    1201             :             }
    1202             :           }
    1203             :         }
    1204             :       } else if (MOI->isUse()) {
    1205             :         // If the register is defined by a Phi, then create a true dependence.
    1206        3756 :         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
    1207        3756 :         if (DefMI == nullptr)
    1208          11 :           continue;
    1209             :         SUnit *SU = getSUnit(DefMI);
    1210        2769 :         if (SU != nullptr && DefMI->isPHI()) {
    1211             :           if (!MI->isPHI()) {
    1212             :             SDep Dep(SU, SDep::Data, Reg);
    1213             :             Dep.setLatency(0);
    1214         771 :             ST.adjustSchedDependency(SU, &I, Dep);
    1215         771 :             I.addPred(Dep);
    1216             :           } else {
    1217             :             HasPhiUse = Reg;
    1218             :             // Add a chain edge to a dependent Phi that isn't an existing
    1219             :             // predecessor.
    1220          38 :             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
    1221           0 :               I.addPred(SDep(SU, SDep::Barrier));
    1222             :           }
    1223             :         }
    1224             :       }
    1225             :     }
    1226             :     // Remove order dependences from an unrelated Phi.
    1227        2066 :     if (!SwpPruneDeps)
    1228           0 :       continue;
    1229        8038 :     for (auto &PI : I.Preds) {
    1230        2986 :       MachineInstr *PMI = PI.getSUnit()->getInstr();
    1231        1196 :       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
    1232          14 :         if (I.getInstr()->isPHI()) {
    1233          14 :           if (PMI->getOperand(0).getReg() == HasPhiUse)
    1234           0 :             continue;
    1235          28 :           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
    1236          14 :             continue;
    1237             :         }
    1238           0 :         RemoveDeps.push_back(PI);
    1239             :       }
    1240             :     }
    1241        2066 :     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
    1242           0 :       I.removePred(RemoveDeps[i]);
    1243             :   }
    1244         193 : }
    1245             : 
    1246             : /// Iterate over each DAG node and see if we can change any dependences
    1247             : /// in order to reduce the recurrence MII.
    1248         193 : void SwingSchedulerDAG::changeDependences() {
    1249             :   // See if an instruction can use a value from the previous iteration.
    1250             :   // If so, we update the base and offset of the instruction and change
    1251             :   // the dependences.
    1252        2259 :   for (SUnit &I : SUnits) {
    1253        2066 :     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    1254        2066 :     int64_t NewOffset = 0;
    1255        2066 :     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
    1256             :                                NewOffset))
    1257        4076 :       continue;
    1258             : 
    1259             :     // Get the MI and SUnit for the instruction that defines the original base.
    1260          56 :     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    1261          28 :     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    1262          28 :     if (!DefMI)
    1263           0 :       continue;
    1264             :     SUnit *DefSU = getSUnit(DefMI);
    1265          28 :     if (!DefSU)
    1266           0 :       continue;
    1267             :     // Get the MI and SUnit for the instruction that defins the new base.
    1268          28 :     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    1269          28 :     if (!LastMI)
    1270           0 :       continue;
    1271             :     SUnit *LastSU = getSUnit(LastMI);
    1272          28 :     if (!LastSU)
    1273           0 :       continue;
    1274             : 
    1275          28 :     if (Topo.IsReachable(&I, LastSU))
    1276           0 :       continue;
    1277             : 
    1278             :     // Remove the dependence. The value now depends on a prior iteration.
    1279             :     SmallVector<SDep, 4> Deps;
    1280          86 :     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
    1281             :          ++P)
    1282          29 :       if (P->getSUnit() == DefSU)
    1283          28 :         Deps.push_back(*P);
    1284          56 :     for (int i = 0, e = Deps.size(); i != e; i++) {
    1285          56 :       Topo.RemovePred(&I, Deps[i].getSUnit());
    1286          28 :       I.removePred(Deps[i]);
    1287             :     }
    1288             :     // Remove the chain dependence between the instructions.
    1289             :     Deps.clear();
    1290         248 :     for (auto &P : LastSU->Preds)
    1291         135 :       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
    1292          25 :         Deps.push_back(P);
    1293          53 :     for (int i = 0, e = Deps.size(); i != e; i++) {
    1294          50 :       Topo.RemovePred(LastSU, Deps[i].getSUnit());
    1295          25 :       LastSU->removePred(Deps[i]);
    1296             :     }
    1297             : 
    1298             :     // Add a dependence between the new instruction and the instruction
    1299             :     // that defines the new base.
    1300          28 :     SDep Dep(&I, SDep::Anti, NewBase);
    1301          28 :     LastSU->addPred(Dep);
    1302             : 
    1303             :     // Remember the base and offset information so that we can update the
    1304             :     // instruction during code generation.
    1305          56 :     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
    1306             :   }
    1307         193 : }
    1308             : 
    1309             : namespace {
    1310             : 
    1311             : // FuncUnitSorter - Comparison operator used to sort instructions by
    1312             : // the number of functional unit choices.
    1313       13506 : struct FuncUnitSorter {
    1314             :   const InstrItineraryData *InstrItins;
    1315             :   DenseMap<unsigned, unsigned> Resources;
    1316             : 
    1317         193 :   FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
    1318             : 
    1319             :   // Compute the number of functional unit alternatives needed
    1320             :   // at each stage, and take the minimum value. We prioritize the
    1321             :   // instructions by the least number of choices first.
    1322             :   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
    1323       12204 :     unsigned schedClass = Inst->getDesc().getSchedClass();
    1324             :     unsigned min = UINT_MAX;
    1325       24317 :     for (const InstrStage *IS = InstrItins->beginStage(schedClass),
    1326             :                           *IE = InstrItins->endStage(schedClass);
    1327       30419 :          IS != IE; ++IS) {
    1328       18215 :       unsigned funcUnits = IS->getUnits();
    1329             :       unsigned numAlternatives = countPopulation(funcUnits);
    1330       18215 :       if (numAlternatives < min) {
    1331             :         min = numAlternatives;
    1332       13535 :         F = funcUnits;
    1333             :       }
    1334             :     }
    1335             :     return min;
    1336             :   }
    1337             : 
    1338             :   // Compute the critical resources needed by the instruction. This
    1339             :   // function records the functional units needed by instructions that
    1340             :   // must use only one functional unit. We use this as a tie breaker
    1341             :   // for computing the resource MII. The instrutions that require
    1342             :   // the same, highly used, functional unit have high priority.
    1343        1636 :   void calcCriticalResources(MachineInstr &MI) {
    1344        1636 :     unsigned SchedClass = MI.getDesc().getSchedClass();
    1345        4047 :     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
    1346             :                           *IE = InstrItins->endStage(SchedClass);
    1347        4047 :          IS != IE; ++IS) {
    1348        2411 :       unsigned FuncUnits = IS->getUnits();
    1349        2411 :       if (countPopulation(FuncUnits) == 1)
    1350        1324 :         Resources[FuncUnits]++;
    1351             :     }
    1352        1636 :   }
    1353             : 
    1354             :   /// Return true if IS1 has less priority than IS2.
    1355        6102 :   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    1356        6102 :     unsigned F1 = 0, F2 = 0;
    1357        6102 :     unsigned MFUs1 = minFuncUnits(IS1, F1);
    1358             :     unsigned MFUs2 = minFuncUnits(IS2, F2);
    1359        6102 :     if (MFUs1 == 1 && MFUs2 == 1)
    1360        2218 :       return Resources.lookup(F1) < Resources.lookup(F2);
    1361        4993 :     return MFUs1 > MFUs2;
    1362             :   }
    1363             : };
    1364             : 
    1365             : } // end anonymous namespace
    1366             : 
    1367             : /// Calculate the resource constrained minimum initiation interval for the
    1368             : /// specified loop. We use the DFA to model the resources needed for
    1369             : /// each instruction, and we ignore dependences. A different DFA is created
    1370             : /// for each cycle that is required. When adding a new instruction, we attempt
    1371             : /// to add it to each existing DFA, until a legal space is found. If the
    1372             : /// instruction cannot be reserved in an existing DFA, we create a new one.
    1373         193 : unsigned SwingSchedulerDAG::calculateResMII() {
    1374             :   SmallVector<DFAPacketizer *, 8> Resources;
    1375         193 :   MachineBasicBlock *MBB = Loop.getHeader();
    1376         193 :   Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
    1377             : 
    1378             :   // Sort the instructions by the number of available choices for scheduling,
    1379             :   // least to most. Use the number of critical resources as the tie breaker.
    1380             :   FuncUnitSorter FUS =
    1381         193 :       FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
    1382         193 :   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
    1383         193 :                                    E = MBB->getFirstTerminator();
    1384        1829 :        I != E; ++I)
    1385        1636 :     FUS.calcCriticalResources(*I);
    1386             :   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
    1387         193 :       FuncUnitOrder(FUS);
    1388             : 
    1389         193 :   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
    1390         193 :                                    E = MBB->getFirstTerminator();
    1391        1829 :        I != E; ++I)
    1392        1636 :     FuncUnitOrder.push(&*I);
    1393             : 
    1394        1829 :   while (!FuncUnitOrder.empty()) {
    1395        1636 :     MachineInstr *MI = FuncUnitOrder.top();
    1396        1636 :     FuncUnitOrder.pop();
    1397        3272 :     if (TII->isZeroCost(MI->getOpcode()))
    1398          80 :       continue;
    1399             :     // Attempt to reserve the instruction in an existing DFA. At least one
    1400             :     // DFA is needed for each cycle.
    1401        1556 :     unsigned NumCycles = getSUnit(MI)->Latency;
    1402             :     unsigned ReservedCycles = 0;
    1403             :     SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
    1404             :     SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
    1405        4668 :     for (unsigned C = 0; C < NumCycles; ++C)
    1406        6240 :       while (RI != RE) {
    1407        5896 :         if ((*RI++)->canReserveResources(*MI)) {
    1408        1212 :           ++ReservedCycles;
    1409        1212 :           break;
    1410             :         }
    1411             :       }
    1412             :     // Start reserving resources using existing DFAs.
    1413        3980 :     for (unsigned C = 0; C < ReservedCycles; ++C) {
    1414        1212 :       --RI;
    1415        1212 :       (*RI)->reserveResources(*MI);
    1416             :     }
    1417             :     // Add new DFAs, if needed, to reserve resources.
    1418        2244 :     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
    1419             :       DFAPacketizer *NewResource =
    1420         344 :           TII->CreateTargetScheduleState(MF.getSubtarget());
    1421             :       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
    1422         344 :       NewResource->reserveResources(*MI);
    1423         344 :       Resources.push_back(NewResource);
    1424             :     }
    1425             :   }
    1426             :   int Resmii = Resources.size();
    1427             :   // Delete the memory for each of the DFAs that were created earlier.
    1428        1267 :   for (DFAPacketizer *RI : Resources) {
    1429             :     DFAPacketizer *D = RI;
    1430        1074 :     delete D;
    1431             :   }
    1432             :   Resources.clear();
    1433         386 :   return Resmii;
    1434             : }
    1435             : 
    1436             : /// Calculate the recurrence-constrainted minimum initiation interval.
    1437             : /// Iterate over each circuit.  Compute the delay(c) and distance(c)
    1438             : /// for each circuit. The II needs to satisfy the inequality
    1439             : /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
    1440             : /// II that satistifies the inequality, and the RecMII is the maximum
    1441             : /// of those values.
    1442             : unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
    1443             :   unsigned RecMII = 0;
    1444             : 
    1445        1277 :   for (NodeSet &Nodes : NodeSets) {
    1446         542 :     if (Nodes.empty())
    1447             :       continue;
    1448             : 
    1449         542 :     unsigned Delay = Nodes.getLatency();
    1450             :     unsigned Distance = 1;
    1451             : 
    1452             :     // ii = ceil(delay / distance)
    1453             :     unsigned CurMII = (Delay + Distance - 1) / Distance;
    1454             :     Nodes.setRecMII(CurMII);
    1455         542 :     if (CurMII > RecMII)
    1456             :       RecMII = CurMII;
    1457             :   }
    1458             : 
    1459             :   return RecMII;
    1460             : }
    1461             : 
    1462             : /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
    1463             : /// but we do this to find the circuits, and then change them back.
    1464         386 : static void swapAntiDependences(std::vector<SUnit> &SUnits) {
    1465             :   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
    1466        4904 :   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    1467        4132 :     SUnit *SU = &SUnits[i];
    1468        5922 :     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
    1469       10054 :          IP != EP; ++IP) {
    1470        5922 :       if (IP->getKind() != SDep::Anti)
    1471        5062 :         continue;
    1472         860 :       DepsAdded.push_back(std::make_pair(SU, *IP));
    1473             :     }
    1474             :   }
    1475         860 :   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
    1476             :                                                           E = DepsAdded.end();
    1477        1246 :        I != E; ++I) {
    1478             :     // Remove this anti dependency and add one in the reverse direction.
    1479         860 :     SUnit *SU = I->first;
    1480         860 :     SDep &D = I->second;
    1481             :     SUnit *TargetSU = D.getSUnit();
    1482         860 :     unsigned Reg = D.getReg();
    1483         860 :     unsigned Lat = D.getLatency();
    1484         860 :     SU->removePred(D);
    1485             :     SDep Dep(SU, SDep::Anti, Reg);
    1486             :     Dep.setLatency(Lat);
    1487         860 :     TargetSU->addPred(Dep);
    1488             :   }
    1489         386 : }
    1490             : 
    1491             : /// Create the adjacency structure of the nodes in the graph.
    1492         193 : void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    1493             :     SwingSchedulerDAG *DAG) {
    1494         386 :   BitVector Added(SUnits.size());
    1495             :   DenseMap<int, int> OutputDeps;
    1496        2452 :   for (int i = 0, e = SUnits.size(); i != e; ++i) {
    1497             :     Added.reset();
    1498             :     // Add any successor to the adjacency matrix and exclude duplicates.
    1499       10054 :     for (auto &SI : SUnits[i].Succs) {
    1500             :       // Only create a back-edge on the first and last nodes of a dependence
    1501             :       // chain. This records any chains and adds them later.
    1502        2961 :       if (SI.getKind() == SDep::Output) {
    1503           0 :         int N = SI.getSUnit()->NodeNum;
    1504           0 :         int BackEdge = i;
    1505           0 :         auto Dep = OutputDeps.find(BackEdge);
    1506           0 :         if (Dep != OutputDeps.end()) {
    1507           0 :           BackEdge = Dep->second;
    1508             :           OutputDeps.erase(Dep);
    1509             :         }
    1510           0 :         OutputDeps[N] = BackEdge;
    1511             :       }
    1512             :       // Do not process a boundary node and a back-edge is processed only
    1513             :       // if it goes to a Phi.
    1514        5952 :       if (SI.getSUnit()->isBoundaryNode() ||
    1515         430 :           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
    1516          30 :         continue;
    1517        2931 :       int N = SI.getSUnit()->NodeNum;
    1518        2931 :       if (!Added.test(N)) {
    1519        2903 :         AdjK[i].push_back(N);
    1520        2903 :         Added.set(N);
    1521             :       }
    1522             :     }
    1523             :     // A chain edge between a store and a load is treated as a back-edge in the
    1524             :     // adjacency matrix.
    1525       10054 :     for (auto &PI : SUnits[i].Preds) {
    1526        9283 :       if (!SUnits[i].getInstr()->mayStore() ||
    1527        1000 :           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
    1528        2861 :         continue;
    1529         200 :       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
    1530          99 :         int N = PI.getSUnit()->NodeNum;
    1531          99 :         if (!Added.test(N)) {
    1532          99 :           AdjK[i].push_back(N);
    1533          99 :           Added.set(N);
    1534             :         }
    1535             :       }
    1536             :     }
    1537             :   }
    1538             :   // Add back-eges in the adjacency matrix for the output dependences.
    1539         386 :   for (auto &OD : OutputDeps)
    1540           0 :     if (!Added.test(OD.second)) {
    1541           0 :       AdjK[OD.first].push_back(OD.second);
    1542           0 :       Added.set(OD.second);
    1543             :     }
    1544         193 : }
    1545             : 
    1546             : /// Identify an elementary circuit in the dependence graph starting at the
    1547             : /// specified node.
    1548       16769 : bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
    1549             :                                           bool HasBackedge) {
    1550       33538 :   SUnit *SV = &SUnits[V];
    1551             :   bool F = false;
    1552       16769 :   Stack.insert(SV);
    1553       16769 :   Blocked.set(V);
    1554             : 
    1555       63593 :   for (auto W : AdjK[V]) {
    1556       24237 :     if (NumPaths > MaxPaths)
    1557             :       break;
    1558       24101 :     if (W < S)
    1559        3078 :       continue;
    1560       21023 :     if (W == S) {
    1561         689 :       if (!HasBackedge)
    1562        1626 :         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
    1563             :       F = true;
    1564         689 :       ++NumPaths;
    1565             :       break;
    1566       40668 :     } else if (!Blocked.test(W)) {
    1567       14703 :       if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
    1568             :         F = true;
    1569             :     }
    1570             :   }
    1571             : 
    1572       16080 :   if (F)
    1573        2146 :     unblock(V);
    1574             :   else {
    1575       54347 :     for (auto W : AdjK[V]) {
    1576       19862 :       if (W < S)
    1577        2782 :         continue;
    1578       34160 :       if (B[W].count(SV) == 0)
    1579       16587 :         B[W].insert(SV);
    1580             :     }
    1581             :   }
    1582             :   Stack.pop_back();
    1583       16769 :   return F;
    1584             : }
    1585             : 
    1586             : /// Unblock a node in the circuit finding algorithm.
    1587        4972 : void SwingSchedulerDAG::Circuits::unblock(int U) {
    1588        4972 :   Blocked.reset(U);
    1589        4972 :   SmallPtrSet<SUnit *, 4> &BU = B[U];
    1590       13936 :   while (!BU.empty()) {
    1591        4482 :     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
    1592             :     assert(SI != BU.end() && "Invalid B set.");
    1593        4482 :     SUnit *W = *SI;
    1594             :     BU.erase(W);
    1595        8964 :     if (Blocked.test(W->NodeNum))
    1596        2826 :       unblock(W->NodeNum);
    1597             :   }
    1598        4972 : }
    1599             : 
    1600             : /// Identify all the elementary circuits in the dependence graph using
    1601             : /// Johnson's circuit algorithm.
    1602         193 : void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
    1603             :   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
    1604             :   // but we do this to find the circuits, and then change them back.
    1605         193 :   swapAntiDependences(SUnits);
    1606             : 
    1607         386 :   Circuits Cir(SUnits);
    1608             :   // Create the adjacency structure.
    1609         193 :   Cir.createAdjacencyStructure(this);
    1610        2452 :   for (int i = 0, e = SUnits.size(); i != e; ++i) {
    1611        2066 :     Cir.reset();
    1612        2066 :     Cir.circuit(i, i, NodeSets);
    1613             :   }
    1614             : 
    1615             :   // Change the dependences back so that we've created a DAG again.
    1616         193 :   swapAntiDependences(SUnits);
    1617         193 : }
    1618             : 
    1619             : /// Return true for DAG nodes that we ignore when computing the cost functions.
    1620             : /// We ignore the back-edge recurrence in order to avoid unbounded recurison
    1621             : /// in the calculation of the ASAP, ALAP, etc functions.
    1622             : static bool ignoreDependence(const SDep &D, bool isPred) {
    1623             :   if (D.isArtificial())
    1624             :     return true;
    1625       13890 :   return D.getKind() == SDep::Anti && isPred;
    1626             : }
    1627             : 
    1628             : /// Compute several functions need to order the nodes for scheduling.
    1629             : ///  ASAP - Earliest time to schedule a node.
    1630             : ///  ALAP - Latest time to schedule a node.
    1631             : ///  MOV - Mobility function, difference between ALAP and ASAP.
    1632             : ///  D - Depth of each node.
    1633             : ///  H - Height of each node.
    1634         191 : void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
    1635         382 :   ScheduleInfo.resize(SUnits.size());
    1636             : 
    1637             :   LLVM_DEBUG({
    1638             :     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
    1639             :                                                     E = Topo.end();
    1640             :          I != E; ++I) {
    1641             :       SUnit *SU = &SUnits[*I];
    1642             :       SU->dump(this);
    1643             :     }
    1644             :   });
    1645             : 
    1646         191 :   int maxASAP = 0;
    1647             :   // Compute ASAP and ZeroLatencyDepth.
    1648             :   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
    1649             :                                                   E = Topo.end();
    1650        2091 :        I != E; ++I) {
    1651        1900 :     int asap = 0;
    1652        1900 :     int zeroLatencyDepth = 0;
    1653        1900 :     SUnit *SU = &SUnits[*I];
    1654        2696 :     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
    1655             :                                     EP = SU->Preds.end();
    1656        4596 :          IP != EP; ++IP) {
    1657             :       SUnit *pred = IP->getSUnit();
    1658        2696 :       if (IP->getLatency() == 0)
    1659        1128 :         zeroLatencyDepth =
    1660        4512 :             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
    1661         425 :       if (ignoreDependence(*IP, true))
    1662         425 :         continue;
    1663        6813 :       asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
    1664             :                                   getDistance(pred, SU, *IP) * MII));
    1665             :     }
    1666        1900 :     maxASAP = std::max(maxASAP, asap);
    1667        3800 :     ScheduleInfo[*I].ASAP = asap;
    1668        3800 :     ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
    1669             :   }
    1670             : 
    1671             :   // Compute ALAP, ZeroLatencyHeight, and MOV.
    1672             :   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
    1673             :                                                           E = Topo.rend();
    1674        2091 :        I != E; ++I) {
    1675        1900 :     int alap = maxASAP;
    1676        1900 :     int zeroLatencyHeight = 0;
    1677        1900 :     SUnit *SU = &SUnits[*I];
    1678        2696 :     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
    1679             :                                     ES = SU->Succs.end();
    1680        4596 :          IS != ES; ++IS) {
    1681             :       SUnit *succ = IS->getSUnit();
    1682        2696 :       if (IS->getLatency() == 0)
    1683        1128 :         zeroLatencyHeight =
    1684        4512 :             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
    1685         425 :       if (ignoreDependence(*IS, true))
    1686         425 :         continue;
    1687        6813 :       alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
    1688             :                                   getDistance(SU, succ, *IS) * MII));
    1689             :     }
    1690             : 
    1691        3800 :     ScheduleInfo[*I].ALAP = alap;
    1692        3800 :     ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
    1693             :   }
    1694             : 
    1695             :   // After computing the node functions, compute the summary for each node set.
    1696        1023 :   for (NodeSet &I : NodeSets)
    1697         416 :     I.computeNodeSetInfo(this);
    1698             : 
    1699             :   LLVM_DEBUG({
    1700             :     for (unsigned i = 0; i < SUnits.size(); i++) {
    1701             :       dbgs() << "\tNode " << i << ":\n";
    1702             :       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
    1703             :       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
    1704             :       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
    1705             :       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
    1706             :       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
    1707             :       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
    1708             :       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    1709             :     }
    1710             :   });
    1711         191 : }
    1712             : 
    1713             : /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
    1714             : /// as the predecessors of the elements of NodeOrder that are not also in
    1715             : /// NodeOrder.
    1716         856 : static bool pred_L(SetVector<SUnit *> &NodeOrder,
    1717             :                    SmallSetVector<SUnit *, 8> &Preds,
    1718             :                    const NodeSet *S = nullptr) {
    1719         856 :   Preds.clear();
    1720             :   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
    1721        7536 :        I != E; ++I) {
    1722       18123 :     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
    1723       18123 :          PI != PE; ++PI) {
    1724       14388 :       if (S && S->count(PI->getSUnit()) == 0)
    1725        2945 :         continue;
    1726        1349 :       if (ignoreDependence(*PI, true))
    1727        1349 :         continue;
    1728        7149 :       if (NodeOrder.count(PI->getSUnit()) == 0)
    1729         941 :         Preds.insert(PI->getSUnit());
    1730             :     }
    1731             :     // Back-edges are predecessors with an anti-dependence.
    1732       16910 :     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
    1733             :                                     ES = (*I)->Succs.end();
    1734       16910 :          IS != ES; ++IS) {
    1735       10230 :       if (IS->getKind() != SDep::Anti)
    1736        8864 :         continue;
    1737        1688 :       if (S && S->count(IS->getSUnit()) == 0)
    1738         322 :         continue;
    1739        1044 :       if (NodeOrder.count(IS->getSUnit()) == 0)
    1740           0 :         Preds.insert(IS->getSUnit());
    1741             :     }
    1742             :   }
    1743         856 :   return !Preds.empty();
    1744             : }
    1745             : 
    1746             : /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
    1747             : /// as the successors of the elements of NodeOrder that are not also in
    1748             : /// NodeOrder.
    1749        2489 : static bool succ_L(SetVector<SUnit *> &NodeOrder,
    1750             :                    SmallSetVector<SUnit *, 8> &Succs,
    1751             :                    const NodeSet *S = nullptr) {
    1752        2489 :   Succs.clear();
    1753             :   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
    1754       14284 :        I != E; ++I) {
    1755       30588 :     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
    1756       30588 :          SI != SE; ++SI) {
    1757       21812 :       if (S && S->count(SI->getSUnit()) == 0)
    1758        3019 :         continue;
    1759           0 :       if (ignoreDependence(*SI, false))
    1760           0 :         continue;
    1761       15774 :       if (NodeOrder.count(SI->getSUnit()) == 0)
    1762        2296 :         Succs.insert(SI->getSUnit());
    1763             :     }
    1764       33305 :     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
    1765             :                                     PE = (*I)->Preds.end();
    1766       33305 :          PI != PE; ++PI) {
    1767       21510 :       if (PI->getKind() != SDep::Anti)
    1768       17791 :         continue;
    1769        4193 :       if (S && S->count(PI->getSUnit()) == 0)
    1770         474 :         continue;
    1771        3245 :       if (NodeOrder.count(PI->getSUnit()) == 0)
    1772         850 :         Succs.insert(PI->getSUnit());
    1773             :     }
    1774             :   }
    1775        2489 :   return !Succs.empty();
    1776             : }
    1777             : 
    1778             : /// Return true if there is a path from the specified node to any of the nodes
    1779             : /// in DestNodes. Keep track and return the nodes in any path.
    1780        5474 : static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
    1781             :                         SetVector<SUnit *> &DestNodes,
    1782             :                         SetVector<SUnit *> &Exclude,
    1783             :                         SmallPtrSet<SUnit *, 8> &Visited) {
    1784        5474 :   if (Cur->isBoundaryNode())
    1785             :     return false;
    1786        5474 :   if (Exclude.count(Cur) != 0)
    1787             :     return false;
    1788        9646 :   if (DestNodes.count(Cur) != 0)
    1789             :     return true;
    1790        4328 :   if (!Visited.insert(Cur).second)
    1791        2018 :     return Path.count(Cur) != 0;
    1792             :   bool FoundPath = false;
    1793       14816 :   for (auto &SI : Cur->Succs)
    1794        4089 :     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
    1795       17698 :   for (auto &PI : Cur->Preds)
    1796        5530 :     if (PI.getKind() == SDep::Anti)
    1797         325 :       FoundPath |=
    1798         325 :           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
    1799        3319 :   if (FoundPath)
    1800         740 :     Path.insert(Cur);
    1801             :   return FoundPath;
    1802             : }
    1803             : 
    1804             : /// Return true if Set1 is a subset of Set2.
    1805         591 : template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
    1806        1511 :   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
    1807        1033 :     if (Set2.count(*I) == 0)
    1808             :       return false;
    1809             :   return true;
    1810             : }
    1811             : 
    1812             : /// Compute the live-out registers for the instructions in a node-set.
    1813             : /// The live-out registers are those that are defined in the node-set,
    1814             : /// but not used. Except for use operands of Phis.
    1815          65 : static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
    1816             :                             NodeSet &NS) {
    1817          65 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    1818          65 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    1819             :   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
    1820          65 :   SmallSet<unsigned, 4> Uses;
    1821         503 :   for (SUnit *SU : NS) {
    1822         438 :     const MachineInstr *MI = SU->getInstr();
    1823          15 :     if (MI->isPHI())
    1824          15 :       continue;
    1825        3583 :     for (const MachineOperand &MO : MI->operands())
    1826        2891 :       if (MO.isReg() && MO.isUse()) {
    1827         881 :         unsigned Reg = MO.getReg();
    1828         881 :         if (TargetRegisterInfo::isVirtualRegister(Reg))
    1829         871 :           Uses.insert(Reg);
    1830          10 :         else if (MRI.isAllocatable(Reg))
    1831           0 :           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
    1832           0 :             Uses.insert(*Units);
    1833             :       }
    1834             :   }
    1835         503 :   for (SUnit *SU : NS)
    1836        4186 :     for (const MachineOperand &MO : SU->getInstr()->operands())
    1837        3456 :       if (MO.isReg() && MO.isDef() && !MO.isDead()) {
    1838         437 :         unsigned Reg = MO.getReg();
    1839         437 :         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
    1840         437 :           if (!Uses.count(Reg))
    1841          85 :             LiveOutRegs.push_back(RegisterMaskPair(Reg,
    1842             :                                                    LaneBitmask::getNone()));
    1843           0 :         } else if (MRI.isAllocatable(Reg)) {
    1844           0 :           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
    1845           0 :             if (!Uses.count(*Units))
    1846           0 :               LiveOutRegs.push_back(RegisterMaskPair(*Units,
    1847             :                                                      LaneBitmask::getNone()));
    1848             :         }
    1849             :       }
    1850          65 :   RPTracker.addLiveRegs(LiveOutRegs);
    1851          65 : }
    1852             : 
    1853             : /// A heuristic to filter nodes in recurrent node-sets if the register
    1854             : /// pressure of a set is too high.
    1855         191 : void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
    1856        1023 :   for (auto &NS : NodeSets) {
    1857             :     // Skip small node-sets since they won't cause register pressure problems.
    1858         416 :     if (NS.size() <= 2)
    1859         351 :       continue;
    1860             :     IntervalPressure RecRegPressure;
    1861          65 :     RegPressureTracker RecRPTracker(RecRegPressure);
    1862         130 :     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    1863          65 :     computeLiveOuts(MF, RecRPTracker, NS);
    1864          65 :     RecRPTracker.closeBottom();
    1865             : 
    1866             :     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    1867             :     llvm::sort(SUnits.begin(), SUnits.end(),
    1868             :                [](const SUnit *A, const SUnit *B) {
    1869             :       return A->NodeNum > B->NodeNum;
    1870             :     });
    1871             : 
    1872         494 :     for (auto &SU : SUnits) {
    1873             :       // Since we're computing the register pressure for a subset of the
    1874             :       // instructions in a block, we need to set the tracker for each
    1875             :       // instruction in the node-set. The tracker is set to the instruction
    1876             :       // just after the one we're interested in.
    1877         431 :       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
    1878             :       RecRPTracker.setPos(std::next(CurInstI));
    1879             : 
    1880         431 :       RegPressureDelta RPDelta;
    1881         431 :       ArrayRef<PressureChange> CriticalPSets;
    1882         431 :       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
    1883             :                                              CriticalPSets,
    1884             :                                              RecRegPressure.MaxSetPressure);
    1885         431 :       if (RPDelta.Excess.isValid()) {
    1886             :         LLVM_DEBUG(
    1887             :             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
    1888             :                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
    1889             :                    << ":" << RPDelta.Excess.getUnitInc());
    1890           2 :         NS.setExceedPressure(SU);
    1891           2 :         break;
    1892             :       }
    1893         429 :       RecRPTracker.recede();
    1894             :     }
    1895             :   }
    1896         191 : }
    1897             : 
    1898             : /// A heuristic to colocate node sets that have the same set of
    1899             : /// successors.
    1900         191 : void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
    1901             :   unsigned Colocate = 0;
    1902         607 :   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    1903         416 :     NodeSet &N1 = NodeSets[i];
    1904             :     SmallSetVector<SUnit *, 8> S1;
    1905         832 :     if (N1.empty() || !succ_L(N1, S1))
    1906             :       continue;
    1907         757 :     for (int j = i + 1; j < e; ++j) {
    1908         465 :       NodeSet &N2 = NodeSets[j];
    1909         465 :       if (N1.compareRecMII(N2) != 0)
    1910         259 :         continue;
    1911             :       SmallSetVector<SUnit *, 8> S2;
    1912         604 :       if (N2.empty() || !succ_L(N2, S2))
    1913             :         continue;
    1914         245 :       if (isSubset(S1, S2) && S1.size() == S2.size()) {
    1915          26 :         N1.setColocate(++Colocate);
    1916             :         N2.setColocate(Colocate);
    1917             :         break;
    1918             :       }
    1919             :     }
    1920             :   }
    1921         191 : }
    1922             : 
    1923             : /// Check if the existing node-sets are profitable. If not, then ignore the
    1924             : /// recurrent node-sets, and attempt to schedule all nodes together. This is
    1925             : /// a heuristic. If the MII is large and all the recurrent node-sets are small,
    1926             : /// then it's best to try to schedule all instructions together instead of
    1927             : /// starting with the recurrent node-sets.
    1928         191 : void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
    1929             :   // Look for loops with a large MII.
    1930         191 :   if (MII < 17)
    1931             :     return;
    1932             :   // Check if the node-set contains only a simple add recurrence.
    1933           1 :   for (auto &NS : NodeSets) {
    1934           2 :     if (NS.getRecMII() > 2)
    1935             :       return;
    1936           0 :     if (NS.getMaxDepth() > MII)
    1937             :       return;
    1938             :   }
    1939             :   NodeSets.clear();
    1940             :   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
    1941             :   return;
    1942             : }
    1943             : 
    1944             : /// Add the nodes that do not belong to a recurrence set into groups
    1945             : /// based upon connected componenets.
    1946         191 : void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
    1947         191 :   SetVector<SUnit *> NodesAdded;
    1948             :   SmallPtrSet<SUnit *, 8> Visited;
    1949             :   // Add the nodes that are on a path between the previous node sets and
    1950             :   // the current node set.
    1951        1023 :   for (NodeSet &I : NodeSets) {
    1952             :     SmallSetVector<SUnit *, 8> N;
    1953             :     // Add the nodes from the current node set to the previous node set.
    1954         416 :     if (succ_L(I, N)) {
    1955         318 :       SetVector<SUnit *> Path;
    1956        1568 :       for (SUnit *NI : N) {
    1957         625 :         Visited.clear();
    1958         625 :         computePath(NI, Path, NodesAdded, I, Visited);
    1959             :       }
    1960         318 :       if (!Path.empty())
    1961             :         I.insert(Path.begin(), Path.end());
    1962             :     }
    1963             :     // Add the nodes from the previous node set to the current node set.
    1964             :     N.clear();
    1965         416 :     if (succ_L(NodesAdded, N)) {
    1966         167 :       SetVector<SUnit *> Path;
    1967        1037 :       for (SUnit *NI : N) {
    1968         435 :         Visited.clear();
    1969         435 :         computePath(NI, Path, I, NodesAdded, Visited);
    1970             :       }
    1971         167 :       if (!Path.empty())
    1972             :         I.insert(Path.begin(), Path.end());
    1973             :     }
    1974         416 :     NodesAdded.insert(I.begin(), I.end());
    1975             :   }
    1976             : 
    1977             :   // Create a new node set with the connected nodes of any successor of a node
    1978             :   // in a recurrent set.
    1979             :   NodeSet NewSet;
    1980             :   SmallSetVector<SUnit *, 8> N;
    1981         191 :   if (succ_L(NodesAdded, N))
    1982         394 :     for (SUnit *I : N)
    1983         157 :       addConnectedNodes(I, NewSet, NodesAdded);
    1984         191 :   if (!NewSet.empty())
    1985          80 :     NodeSets.push_back(NewSet);
    1986             : 
    1987             :   // Create a new node set with the connected nodes of any predecessor of a node
    1988             :   // in a recurrent set.
    1989             :   NewSet.clear();
    1990         191 :   if (pred_L(NodesAdded, N))
    1991          90 :     for (SUnit *I : N)
    1992          36 :       addConnectedNodes(I, NewSet, NodesAdded);
    1993         191 :   if (!NewSet.empty())
    1994          18 :     NodeSets.push_back(NewSet);
    1995             : 
    1996             :   // Create new nodes sets with the connected nodes any remaining node that
    1997             :   // has no predecessor.
    1998        6082 :   for (unsigned i = 0; i < SUnits.size(); ++i) {
    1999             :     SUnit *SU = &SUnits[i];
    2000        1900 :     if (NodesAdded.count(SU) == 0) {
    2001             :       NewSet.clear();
    2002          10 :       addConnectedNodes(SU, NewSet, NodesAdded);
    2003          10 :       if (!NewSet.empty())
    2004          10 :         NodeSets.push_back(NewSet);
    2005             :     }
    2006             :   }
    2007         191 : }
    2008             : 
    2009             : /// Add the node to the set, and add all is its connected nodes to the set.
    2010         597 : void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
    2011             :                                           SetVector<SUnit *> &NodesAdded) {
    2012         597 :   NewSet.insert(SU);
    2013         597 :   NodesAdded.insert(SU);
    2014        2618 :   for (auto &SI : SU->Succs) {
    2015             :     SUnit *Successor = SI.getSUnit();
    2016         712 :     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
    2017         214 :       addConnectedNodes(Successor, NewSet, NodesAdded);
    2018             :   }
    2019        2734 :   for (auto &PI : SU->Preds) {
    2020             :     SUnit *Predecessor = PI.getSUnit();
    2021         770 :     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
    2022         180 :       addConnectedNodes(Predecessor, NewSet, NodesAdded);
    2023             :   }
    2024         597 : }
    2025             : 
    2026             : /// Return true if Set1 contains elements in Set2. The elements in common
    2027             : /// are returned in a different container.
    2028         304 : static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
    2029             :                         SmallSetVector<SUnit *, 8> &Result) {
    2030         304 :   Result.clear();
    2031         624 :   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
    2032         640 :     SUnit *SU = Set1[i];
    2033             :     if (Set2.count(SU) != 0)
    2034          60 :       Result.insert(SU);
    2035             :   }
    2036         304 :   return !Result.empty();
    2037             : }
    2038             : 
    2039             : /// Merge the recurrence node sets that have the same initial node.
    2040         193 : void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
    2041         614 :   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
    2042             :        ++I) {
    2043             :     NodeSet &NI = *I;
    2044         421 :     for (NodeSetType::iterator J = I + 1; J != E;) {
    2045             :       NodeSet &NJ = *J;
    2046        2640 :       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
    2047         242 :         if (NJ.compareRecMII(NI) > 0)
    2048             :           NI.setRecMII(NJ.getRecMII());
    2049         783 :         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
    2050             :              ++NII)
    2051         662 :           I->insert(*NII);
    2052         121 :         NodeSets.erase(J);
    2053             :         E = NodeSets.end();
    2054             :       } else {
    2055        1199 :         ++J;
    2056             :       }
    2057             :     }
    2058             :   }
    2059         193 : }
    2060             : 
    2061             : /// Remove nodes that have been scheduled in previous NodeSets.
    2062         191 : void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
    2063        1394 :   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
    2064             :        ++I)
    2065         506 :     for (NodeSetType::iterator J = I + 1; J != E;) {
    2066        2915 :       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
    2067             : 
    2068         733 :       if (J->empty()) {
    2069          18 :         NodeSets.erase(J);
    2070             :         E = NodeSets.end();
    2071             :       } else {
    2072         715 :         ++J;
    2073             :       }
    2074             :     }
    2075         191 : }
    2076             : 
    2077             : /// Compute an ordered list of the dependence graph nodes, which
    2078             : /// indicates the order that the nodes will be scheduled.  This is a
    2079             : /// two-level algorithm. First, a partial order is created, which
    2080             : /// consists of a list of sets ordered from highest to lowest priority.
    2081         191 : void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
    2082             :   SmallSetVector<SUnit *, 8> R;
    2083         191 :   NodeOrder.clear();
    2084             : 
    2085        1203 :   for (auto &Nodes : NodeSets) {
    2086             :     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    2087             :     OrderKind Order;
    2088             :     SmallSetVector<SUnit *, 8> N;
    2089         506 :     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
    2090          92 :       R.insert(N.begin(), N.end());
    2091             :       Order = BottomUp;
    2092             :       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
    2093         414 :     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
    2094         110 :       R.insert(N.begin(), N.end());
    2095             :       Order = TopDown;
    2096             :       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
    2097         304 :     } else if (isIntersect(N, Nodes, R)) {
    2098             :       // If some of the successors are in the existing node-set, then use the
    2099             :       // top-down ordering.
    2100             :       Order = TopDown;
    2101             :       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
    2102         266 :     } else if (NodeSets.size() == 1) {
    2103         180 :       for (auto &N : Nodes)
    2104         340 :         if (N->Succs.size() == 0)
    2105          13 :           R.insert(N);
    2106             :       Order = BottomUp;
    2107             :       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
    2108             :     } else {
    2109             :       // Find the node with the highest ASAP.
    2110         256 :       SUnit *maxASAP = nullptr;
    2111         928 :       for (SUnit *SU : Nodes) {
    2112        1504 :         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
    2113         217 :             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
    2114         620 :           maxASAP = SU;
    2115             :       }
    2116         256 :       R.insert(maxASAP);
    2117             :       Order = BottomUp;
    2118             :       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
    2119             :     }
    2120             : 
    2121        1031 :     while (!R.empty()) {
    2122         525 :       if (Order == TopDown) {
    2123             :         // Choose the node with the maximum height.  If more than one, choose
    2124             :         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
    2125             :         // choose the node with the lowest MOV.
    2126        1449 :         while (!R.empty()) {
    2127         645 :           SUnit *maxHeight = nullptr;
    2128        4269 :           for (SUnit *I : R) {
    2129        4146 :             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
    2130         845 :               maxHeight = I;
    2131        2506 :             else if (getHeight(I) == getHeight(maxHeight) &&
    2132        1144 :                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
    2133          33 :               maxHeight = I;
    2134        1473 :             else if (getHeight(I) == getHeight(maxHeight) &&
    2135         539 :                      getZeroLatencyHeight(I) ==
    2136        1980 :                          getZeroLatencyHeight(maxHeight) &&
    2137             :                      getMOV(I) < getMOV(maxHeight))
    2138          26 :               maxHeight = I;
    2139             :           }
    2140         645 :           NodeOrder.insert(maxHeight);
    2141             :           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
    2142         645 :           R.remove(maxHeight);
    2143        2850 :           for (const auto &I : maxHeight->Succs) {
    2144         152 :             if (Nodes.count(I.getSUnit()) == 0)
    2145         152 :               continue;
    2146         628 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2147         122 :               continue;
    2148           0 :             if (ignoreDependence(I, false))
    2149           0 :               continue;
    2150         506 :             R.insert(I.getSUnit());
    2151             :           }
    2152             :           // Back-edges are predecessors with an anti-dependence.
    2153        3258 :           for (const auto &I : maxHeight->Preds) {
    2154         984 :             if (I.getKind() != SDep::Anti)
    2155         909 :               continue;
    2156           7 :             if (Nodes.count(I.getSUnit()) == 0)
    2157           7 :               continue;
    2158          68 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2159           3 :               continue;
    2160          65 :             R.insert(I.getSUnit());
    2161             :           }
    2162             :         }
    2163             :         Order = BottomUp;
    2164             :         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
    2165             :         SmallSetVector<SUnit *, 8> N;
    2166         159 :         if (pred_L(NodeOrder, N, &Nodes))
    2167           8 :           R.insert(N.begin(), N.end());
    2168             :       } else {
    2169             :         // Choose the node with the maximum depth.  If more than one, choose
    2170             :         // the node with the maximum ZeroLatencyDepth. If still more than one,
    2171             :         // choose the node with the lowest MOV.
    2172        2874 :         while (!R.empty()) {
    2173        1255 :           SUnit *maxDepth = nullptr;
    2174        7267 :           for (SUnit *I : R) {
    2175        6508 :             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
    2176        1595 :               maxDepth = I;
    2177        3603 :             else if (getDepth(I) == getDepth(maxDepth) &&
    2178        1562 :                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
    2179          44 :               maxDepth = I;
    2180        2104 :             else if (getDepth(I) == getDepth(maxDepth) &&
    2181        2723 :                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
    2182             :                      getMOV(I) < getMOV(maxDepth))
    2183         170 :               maxDepth = I;
    2184             :           }
    2185        1255 :           NodeOrder.insert(maxDepth);
    2186             :           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
    2187        1255 :           R.remove(maxDepth);
    2188        1255 :           if (Nodes.isExceedSU(maxDepth)) {
    2189             :             Order = TopDown;
    2190             :             R.clear();
    2191           1 :             R.insert(Nodes.getNode(0));
    2192           1 :             break;
    2193             :           }
    2194        4674 :           for (const auto &I : maxDepth->Preds) {
    2195         396 :             if (Nodes.count(I.getSUnit()) == 0)
    2196         396 :               continue;
    2197        1314 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2198          78 :               continue;
    2199        1236 :             R.insert(I.getSUnit());
    2200             :           }
    2201             :           // Back-edges are predecessors with an anti-dependence.
    2202        6338 :           for (const auto &I : maxDepth->Succs) {
    2203        1915 :             if (I.getKind() != SDep::Anti)
    2204        1613 :               continue;
    2205          38 :             if (Nodes.count(I.getSUnit()) == 0)
    2206          38 :               continue;
    2207         264 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2208         222 :               continue;
    2209          42 :             R.insert(I.getSUnit());
    2210             :           }
    2211             :         }
    2212             :         Order = TopDown;
    2213             :         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
    2214             :         SmallSetVector<SUnit *, 8> N;
    2215         366 :         if (succ_L(NodeOrder, N, &Nodes))
    2216          11 :           R.insert(N.begin(), N.end());
    2217             :       }
    2218             :     }
    2219             :     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
    2220             :   }
    2221             : 
    2222             :   LLVM_DEBUG({
    2223             :     dbgs() << "Node order: ";
    2224             :     for (SUnit *I : NodeOrder)
    2225             :       dbgs() << " " << I->NodeNum << " ";
    2226             :     dbgs() << "\n";
    2227             :   });
    2228         191 : }
    2229             : 
    2230             : /// Process the nodes in the computed order and create the pipelined schedule
    2231             : /// of the instructions, if possible. Return true if a schedule is found.
    2232         191 : bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
    2233         191 :   if (NodeOrder.empty())
    2234             :     return false;
    2235             : 
    2236             :   bool scheduleFound = false;
    2237             :   // Keep increasing II until a valid schedule is found.
    2238         457 :   for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
    2239         274 :     Schedule.reset();
    2240         274 :     Schedule.setInitiationInterval(II);
    2241             :     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
    2242             : 
    2243             :     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    2244             :     SetVector<SUnit *>::iterator NE = NodeOrder.end();
    2245             :     do {
    2246        3107 :       SUnit *SU = *NI;
    2247             : 
    2248             :       // Compute the schedule time for the instruction, which is based
    2249             :       // upon the scheduled time for any predecessors/successors.
    2250        3107 :       int EarlyStart = INT_MIN;
    2251        3107 :       int LateStart = INT_MAX;
    2252             :       // These values are set when the size of the schedule window is limited
    2253             :       // due to chain dependences.
    2254        3107 :       int SchedEnd = INT_MAX;
    2255        3107 :       int SchedStart = INT_MIN;
    2256        3107 :       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
    2257             :                             II, this);
    2258             :       LLVM_DEBUG({
    2259             :         dbgs() << "Inst (" << SU->NodeNum << ") ";
    2260             :         SU->getInstr()->dump();
    2261             :         dbgs() << "\n";
    2262             :       });
    2263             :       LLVM_DEBUG({
    2264             :         dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
    2265             :                << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
    2266             :       });
    2267             : 
    2268        6163 :       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
    2269        3056 :           SchedStart > LateStart)
    2270             :         scheduleFound = false;
    2271        3030 :       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
    2272        1828 :         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
    2273         914 :         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
    2274        2116 :       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
    2275        2674 :         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
    2276        1337 :         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
    2277         779 :       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
    2278         435 :         SchedEnd =
    2279        1305 :             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
    2280             :         // When scheduling a Phi it is better to start at the late cycle and go
    2281             :         // backwards. The default order may insert the Phi too far away from
    2282             :         // its first dependence.
    2283         435 :         if (SU->getInstr()->isPHI())
    2284         350 :           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
    2285             :         else
    2286          85 :           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
    2287             :       } else {
    2288         344 :         int FirstCycle = Schedule.getFirstCycle();
    2289         344 :         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
    2290         688 :                                         FirstCycle + getASAP(SU) + II - 1, II);
    2291             :       }
    2292             :       // Even if we find a schedule, make sure the schedule doesn't exceed the
    2293             :       // allowable number of stages. We keep trying if this happens.
    2294        3030 :       if (scheduleFound)
    2295        6036 :         if (SwpMaxStages > -1 &&
    2296        6036 :             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
    2297             :           scheduleFound = false;
    2298             : 
    2299             :       LLVM_DEBUG({
    2300             :         if (!scheduleFound)
    2301             :           dbgs() << "\tCan't schedule\n";
    2302             :       });
    2303        3107 :     } while (++NI != NE && scheduleFound);
    2304             : 
    2305             :     // If a schedule is found, check if it is a valid schedule too.
    2306         274 :     if (scheduleFound)
    2307         178 :       scheduleFound = Schedule.isValidSchedule(this);
    2308             :   }
    2309             : 
    2310             :   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
    2311             : 
    2312         183 :   if (scheduleFound)
    2313         178 :     Schedule.finalizeSchedule(this);
    2314             :   else
    2315           5 :     Schedule.reset();
    2316             : 
    2317         361 :   return scheduleFound && Schedule.getMaxStageCount() > 0;
    2318             : }
    2319             : 
    2320             : /// Given a schedule for the loop, generate a new version of the loop,
    2321             : /// and replace the old version.  This function generates a prolog
    2322             : /// that contains the initial iterations in the pipeline, and kernel
    2323             : /// loop, and the epilogue that contains the code for the final
    2324             : /// iterations.
    2325         130 : void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
    2326             :   // Create a new basic block for the kernel and add it to the CFG.
    2327         130 :   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    2328             : 
    2329         130 :   unsigned MaxStageCount = Schedule.getMaxStageCount();
    2330             : 
    2331             :   // Remember the registers that are used in different stages. The index is
    2332             :   // the iteration, or stage, that the instruction is scheduled in.  This is
    2333             :   // a map between register names in the orignal block and the names created
    2334             :   // in each stage of the pipelined loop.
    2335         744 :   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
    2336             :   InstrMapTy InstrMap;
    2337             : 
    2338             :   SmallVector<MachineBasicBlock *, 4> PrologBBs;
    2339             :   // Generate the prolog instructions that set up the pipeline.
    2340         130 :   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
    2341         130 :   MF.insert(BB->getIterator(), KernelBB);
    2342             : 
    2343             :   // Rearrange the instructions to generate the new, pipelined loop,
    2344             :   // and update register names as needed.
    2345         487 :   for (int Cycle = Schedule.getFirstCycle(),
    2346         130 :            LastCycle = Schedule.getFinalCycle();
    2347         487 :        Cycle <= LastCycle; ++Cycle) {
    2348             :     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
    2349             :     // This inner loop schedules each instruction in the cycle.
    2350        1683 :     for (SUnit *CI : CycleInstrs) {
    2351        1612 :       if (CI->getInstr()->isPHI())
    2352         286 :         continue;
    2353        1040 :       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
    2354        1040 :       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
    2355        1040 :       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
    2356        1040 :       KernelBB->push_back(NewMI);
    2357        1040 :       InstrMap[NewMI] = CI->getInstr();
    2358             :     }
    2359             :   }
    2360             : 
    2361             :   // Copy any terminator instructions to the new kernel, and update
    2362             :   // names as needed.
    2363         130 :   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
    2364         130 :                                    E = BB->instr_end();
    2365         390 :        I != E; ++I) {
    2366         260 :     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
    2367         260 :     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
    2368         260 :     KernelBB->push_back(NewMI);
    2369         260 :     InstrMap[NewMI] = &*I;
    2370             :   }
    2371             : 
    2372         130 :   KernelBB->transferSuccessors(BB);
    2373         130 :   KernelBB->replaceSuccessor(BB, KernelBB);
    2374             : 
    2375         130 :   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
    2376             :                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
    2377         130 :   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
    2378             :                InstrMap, MaxStageCount, MaxStageCount, false);
    2379             : 
    2380             :   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
    2381             : 
    2382             :   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
    2383             :   // Generate the epilog instructions to complete the pipeline.
    2384         130 :   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
    2385             :                  PrologBBs);
    2386             : 
    2387             :   // We need this step because the register allocation doesn't handle some
    2388             :   // situations well, so we insert copies to help out.
    2389         130 :   splitLifetimes(KernelBB, EpilogBBs, Schedule);
    2390             : 
    2391             :   // Remove dead instructions due to loop induction variables.
    2392         130 :   removeDeadInstructions(KernelBB, EpilogBBs);
    2393             : 
    2394             :   // Add branches between prolog and epilog blocks.
    2395         130 :   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
    2396             : 
    2397             :   // Remove the original loop since it's no longer referenced.
    2398        1976 :   for (auto &I : *BB)
    2399        1586 :     LIS.RemoveMachineInstrFromMaps(I);
    2400         130 :   BB->clear();
    2401         130 :   BB->eraseFromParent();
    2402             : 
    2403         260 :   delete[] VRMap;
    2404         130 : }
    2405             : 
    2406             : /// Generate the pipeline prolog code.
    2407         130 : void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
    2408             :                                        MachineBasicBlock *KernelBB,
    2409             :                                        ValueMapTy *VRMap,
    2410             :                                        MBBVectorTy &PrologBBs) {
    2411         260 :   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
    2412             :   assert(PreheaderBB != nullptr &&
    2413             :          "Need to add code to handle loops w/o preheader");
    2414             :   MachineBasicBlock *PredBB = PreheaderBB;
    2415             :   InstrMapTy InstrMap;
    2416             : 
    2417             :   // Generate a basic block for each stage, not including the last stage,
    2418             :   // which will be generated in the kernel. Each basic block may contain
    2419             :   // instructions from multiple stages/iterations.
    2420         484 :   for (unsigned i = 0; i < LastStage; ++i) {
    2421             :     // Create and insert the prolog basic block prior to the original loop
    2422             :     // basic block.  The original loop is removed later.
    2423         177 :     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    2424         177 :     PrologBBs.push_back(NewBB);
    2425         177 :     MF.insert(BB->getIterator(), NewBB);
    2426         177 :     NewBB->transferSuccessors(PredBB);
    2427         177 :     PredBB->addSuccessor(NewBB);
    2428         177 :     PredBB = NewBB;
    2429             : 
    2430             :     // Generate instructions for each appropriate stage. Process instructions
    2431             :     // in original program order.
    2432         409 :     for (int StageNum = i; StageNum >= 0; --StageNum) {
    2433         232 :       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
    2434         232 :                                        BBE = BB->getFirstTerminator();
    2435        2849 :            BBI != BBE; ++BBI) {
    2436        5234 :         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
    2437         283 :           if (BBI->isPHI())
    2438         283 :             continue;
    2439             :           MachineInstr *NewMI =
    2440         831 :               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
    2441         831 :           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
    2442             :                             VRMap);
    2443         831 :           NewBB->push_back(NewMI);
    2444         831 :           InstrMap[NewMI] = &*BBI;
    2445             :         }
    2446             :       }
    2447             :     }
    2448         177 :     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
    2449             :     LLVM_DEBUG({
    2450             :       dbgs() << "prolog:\n";
    2451             :       NewBB->dump();
    2452             :     });
    2453             :   }
    2454             : 
    2455         130 :   PredBB->replaceSuccessor(BB, KernelBB);
    2456             : 
    2457             :   // Check if we need to remove the branch from the preheader to the original
    2458             :   // loop, and replace it with a branch to the new loop.
    2459         130 :   unsigned numBranches = TII->removeBranch(*PreheaderBB);
    2460         130 :   if (numBranches) {
    2461             :     SmallVector<MachineOperand, 0> Cond;
    2462          93 :     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
    2463             :   }
    2464         130 : }
    2465             : 
    2466             : /// Generate the pipeline epilog code. The epilog code finishes the iterations
    2467             : /// that were started in either the prolog or the kernel.  We create a basic
    2468             : /// block for each stage that needs to complete.
    2469         130 : void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
    2470             :                                        MachineBasicBlock *KernelBB,
    2471             :                                        ValueMapTy *VRMap,
    2472             :                                        MBBVectorTy &EpilogBBs,
    2473             :                                        MBBVectorTy &PrologBBs) {
    2474             :   // We need to change the branch from the kernel to the first epilog block, so
    2475             :   // this call to analyze branch uses the kernel rather than the original BB.
    2476         130 :   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
    2477             :   SmallVector<MachineOperand, 4> Cond;
    2478         130 :   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
    2479             :   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
    2480         130 :   if (checkBranch)
    2481             :     return;
    2482             : 
    2483             :   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
    2484         130 :   if (*LoopExitI == KernelBB)
    2485             :     ++LoopExitI;
    2486             :   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
    2487         130 :   MachineBasicBlock *LoopExitBB = *LoopExitI;
    2488             : 
    2489             :   MachineBasicBlock *PredBB = KernelBB;
    2490             :   MachineBasicBlock *EpilogStart = LoopExitBB;
    2491             :   InstrMapTy InstrMap;
    2492             : 
    2493             :   // Generate a basic block for each stage, not including the last stage,
    2494             :   // which was generated for the kernel.  Each basic block may contain
    2495             :   // instructions from multiple stages/iterations.
    2496         130 :   int EpilogStage = LastStage + 1;
    2497         484 :   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    2498         177 :     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    2499         177 :     EpilogBBs.push_back(NewBB);
    2500         177 :     MF.insert(BB->getIterator(), NewBB);
    2501             : 
    2502         177 :     PredBB->replaceSuccessor(LoopExitBB, NewBB);
    2503         177 :     NewBB->addSuccessor(LoopExitBB);
    2504             : 
    2505         177 :     if (EpilogStart == LoopExitBB)
    2506         130 :       EpilogStart = NewBB;
    2507             : 
    2508             :     // Add instructions to the epilog depending on the current block.
    2509             :     // Process instructions in original program order.
    2510         641 :     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
    2511        3777 :       for (auto &BBI : *BB) {
    2512         564 :         if (BBI.isPHI())
    2513         564 :           continue;
    2514             :         MachineInstr *In = &BBI;
    2515        2517 :         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
    2516             :           // Instructions with memoperands in the epilog are updated with
    2517             :           // conservative values.
    2518         690 :           MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
    2519         690 :           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
    2520         690 :           NewBB->push_back(NewMI);
    2521         690 :           InstrMap[NewMI] = In;
    2522             :         }
    2523             :       }
    2524             :     }
    2525         354 :     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
    2526             :                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
    2527         177 :     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
    2528             :                  InstrMap, LastStage, EpilogStage, i == 1);
    2529         177 :     PredBB = NewBB;
    2530             : 
    2531             :     LLVM_DEBUG({
    2532             :       dbgs() << "epilog:\n";
    2533             :       NewBB->dump();
    2534             :     });
    2535             :   }
    2536             : 
    2537             :   // Fix any Phi nodes in the loop exit block.
    2538         274 :   for (MachineInstr &MI : *LoopExitBB) {
    2539             :     if (!MI.isPHI())
    2540             :       break;
    2541          45 :     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
    2542          31 :       MachineOperand &MO = MI.getOperand(i);
    2543          31 :       if (MO.getMBB() == BB)
    2544             :         MO.setMBB(PredBB);
    2545             :     }
    2546             :   }
    2547             : 
    2548             :   // Create a branch to the new epilog from the kernel.
    2549             :   // Remove the original branch and add a new branch to the epilog.
    2550         130 :   TII->removeBranch(*KernelBB);
    2551         390 :   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
    2552             :   // Add a branch to the loop exit.
    2553         130 :   if (EpilogBBs.size() > 0) {
    2554         130 :     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    2555             :     SmallVector<MachineOperand, 4> Cond1;
    2556         390 :     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
    2557             :   }
    2558             : }
    2559             : 
    2560             : /// Replace all uses of FromReg that appear outside the specified
    2561             : /// basic block with ToReg.
    2562        1015 : static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
    2563             :                                     MachineBasicBlock *MBB,
    2564             :                                     MachineRegisterInfo &MRI,
    2565             :                                     LiveIntervals &LIS) {
    2566        1015 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
    2567             :                                          E = MRI.use_end();
    2568        2450 :        I != E;) {
    2569             :     MachineOperand &O = *I;
    2570             :     ++I;
    2571        1435 :     if (O.getParent()->getParent() != MBB)
    2572          72 :       O.setReg(ToReg);
    2573             :   }
    2574             :   if (!LIS.hasInterval(ToReg))
    2575        1015 :     LIS.createEmptyInterval(ToReg);
    2576        1015 : }
    2577             : 
    2578             : /// Return true if the register has a use that occurs outside the
    2579             : /// specified loop.
    2580         395 : static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
    2581             :                             MachineRegisterInfo &MRI) {
    2582         395 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
    2583             :                                          E = MRI.use_end();
    2584         831 :        I != E; ++I)
    2585         444 :     if (I->getParent()->getParent() != BB)
    2586             :       return true;
    2587             :   return false;
    2588             : }
    2589             : 
    2590             : /// Generate Phis for the specific block in the generated pipelined code.
    2591             : /// This function looks at the Phis from the original code to guide the
    2592             : /// creation of new Phis.
    2593         307 : void SwingSchedulerDAG::generateExistingPhis(
    2594             :     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    2595             :     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
    2596             :     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    2597             :     bool IsLast) {
    2598             :   // Compute the stage number for the initial value of the Phi, which
    2599             :   // comes from the prolog. The prolog to use depends on to which kernel/
    2600             :   // epilog that we're adding the Phi.
    2601             :   unsigned PrologStage = 0;
    2602             :   unsigned PrevStage = 0;
    2603         307 :   bool InKernel = (LastStageNum == CurStageNum);
    2604         307 :   if (InKernel) {
    2605         130 :     PrologStage = LastStageNum - 1;
    2606             :     PrevStage = CurStageNum;
    2607             :   } else {
    2608         177 :     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
    2609         177 :     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
    2610             :   }
    2611             : 
    2612         307 :   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
    2613         307 :                                    BBE = BB->getFirstNonPHI();
    2614        1010 :        BBI != BBE; ++BBI) {
    2615         703 :     unsigned Def = BBI->getOperand(0).getReg();
    2616             : 
    2617             :     unsigned InitVal = 0;
    2618         703 :     unsigned LoopVal = 0;
    2619         703 :     getPhiRegs(*BBI, BB, InitVal, LoopVal);
    2620             : 
    2621         703 :     unsigned PhiOp1 = 0;
    2622             :     // The Phi value from the loop body typically is defined in the loop, but
    2623             :     // not always. So, we need to check if the value is defined in the loop.
    2624         703 :     unsigned PhiOp2 = LoopVal;
    2625         703 :     if (VRMap[LastStageNum].count(LoopVal))
    2626         690 :       PhiOp2 = VRMap[LastStageNum][LoopVal];
    2627             : 
    2628         703 :     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
    2629             :     int LoopValStage =
    2630        1406 :         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
    2631         703 :     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
    2632         703 :     if (NumStages == 0) {
    2633             :       // We don't need to generate a Phi anymore, but we need to rename any uses
    2634             :       // of the Phi value.
    2635           6 :       unsigned NewReg = VRMap[PrevStage][LoopVal];
    2636           6 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
    2637             :                             Def, InitVal, NewReg);
    2638           3 :       if (VRMap[CurStageNum].count(LoopVal))
    2639           3 :         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
    2640             :     }
    2641             :     // Adjust the number of Phis needed depending on the number of prologs left,
    2642             :     // and the distance from where the Phi is first scheduled. The number of
    2643             :     // Phis cannot exceed the number of prolog stages. Each stage can
    2644             :     // potentially define two values.
    2645         703 :     unsigned MaxPhis = PrologStage + 2;
    2646         703 :     if (!InKernel && (int)PrologStage <= LoopValStage)
    2647         694 :       MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
    2648         703 :     unsigned NumPhis = std::min(NumStages, MaxPhis);
    2649             : 
    2650             :     unsigned NewReg = 0;
    2651         703 :     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
    2652             :     // In the epilog, we may need to look back one stage to get the correct
    2653             :     // Phi name because the epilog and prolog blocks execute the same stage.
    2654             :     // The correct name is from the previous block only when the Phi has
    2655             :     // been completely scheduled prior to the epilog, and Phi value is not
    2656             :     // needed in multiple stages.
    2657             :     int StageDiff = 0;
    2658        1100 :     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
    2659         397 :         NumPhis == 1)
    2660             :       StageDiff = 1;
    2661             :     // Adjust the computations below when the phi and the loop definition
    2662             :     // are scheduled in different stages.
    2663         703 :     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
    2664           0 :       StageDiff = StageScheduled - LoopValStage;
    2665        2247 :     for (unsigned np = 0; np < NumPhis; ++np) {
    2666             :       // If the Phi hasn't been scheduled, then use the initial Phi operand
    2667             :       // value. Otherwise, use the scheduled version of the instruction. This
    2668             :       // is a little complicated when a Phi references another Phi.
    2669         772 :       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
    2670         240 :         PhiOp1 = InitVal;
    2671             :       // Check if the Phi has already been scheduled in a prolog stage.
    2672         532 :       else if (PrologStage >= AccessStage + StageDiff + np &&
    2673         338 :                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
    2674         306 :         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
    2675             :       // Check if the Phi has already been scheduled, but the loop intruction
    2676             :       // is either another Phi, or doesn't occur in the loop.
    2677         226 :       else if (PrologStage >= AccessStage + StageDiff + np) {
    2678             :         // If the Phi references another Phi, we need to examine the other
    2679             :         // Phi to get the correct value.
    2680          32 :         PhiOp1 = LoopVal;
    2681          32 :         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
    2682             :         int Indirects = 1;
    2683         102 :         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
    2684          29 :           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
    2685          29 :           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
    2686          38 :             PhiOp1 = getInitPhiReg(*InstOp1, BB);
    2687             :           else
    2688          20 :             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
    2689          29 :           InstOp1 = MRI.getVRegDef(PhiOp1);
    2690          29 :           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
    2691          29 :           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
    2692          29 :           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
    2693          10 :               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
    2694          10 :             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
    2695          10 :             break;
    2696             :           }
    2697          19 :           ++Indirects;
    2698             :         }
    2699             :       } else
    2700         194 :         PhiOp1 = InitVal;
    2701             :       // If this references a generated Phi in the kernel, get the Phi operand
    2702             :       // from the incoming block.
    2703         772 :       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
    2704          65 :         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
    2705           2 :           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
    2706             : 
    2707         772 :       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
    2708         772 :       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
    2709             :       // In the epilog, a map lookup is needed to get the value from the kernel,
    2710             :       // or previous epilog block. How is does this depends on if the
    2711             :       // instruction is scheduled in the previous block.
    2712         772 :       if (!InKernel) {
    2713             :         int StageDiffAdj = 0;
    2714         458 :         if (LoopValStage != -1 && StageScheduled > LoopValStage)
    2715           0 :           StageDiffAdj = StageScheduled - LoopValStage;
    2716             :         // Use the loop value defined in the kernel, unless the kernel
    2717             :         // contains the last definition of the Phi.
    2718         744 :         if (np == 0 && PrevStage == LastStageNum &&
    2719         458 :             (StageScheduled != 0 || LoopValStage != 0) &&
    2720         107 :             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
    2721         105 :           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
    2722             :         // Use the value defined by the Phi. We add one because we switch
    2723             :         // from looking at the loop value to the Phi definition.
    2724         353 :         else if (np > 0 && PrevStage == LastStageNum &&
    2725          31 :                  VRMap[PrevStage - np + 1].count(Def))
    2726          31 :           PhiOp2 = VRMap[PrevStage - np + 1][Def];
    2727             :         // Use the loop value defined in the kernel.
    2728         322 :         else if ((unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 &&
    2729          43 :                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
    2730          40 :           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
    2731             :         // Use the value defined by the Phi, unless we're generating the first
    2732             :         // epilog and the Phi refers to a Phi in a different stage.
    2733         564 :         else if (VRMap[PrevStage - np].count(Def) &&
    2734         282 :                  (!LoopDefIsPhi || PrevStage != LastStageNum))
    2735         265 :           PhiOp2 = VRMap[PrevStage - np][Def];
    2736             :       }
    2737             : 
    2738             :       // Check if we can reuse an existing Phi. This occurs when a Phi
    2739             :       // references another Phi, and the other Phi is scheduled in an
    2740             :       // earlier stage. We can try to reuse an existing Phi up until the last
    2741             :       // stage of the current Phi.
    2742         772 :       if (LoopDefIsPhi && (int)(PrologStage - np) >= StageScheduled) {
    2743          94 :         int LVNumStages = Schedule.getStagesForPhi(LoopVal);
    2744          47 :         int StageDiff = (StageScheduled - LoopValStage);
    2745          47 :         LVNumStages -= StageDiff;
    2746             :         // Make sure the loop value Phi has been processed already.
    2747          47 :         if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
    2748             :           NewReg = PhiOp2;
    2749             :           unsigned ReuseStage = CurStageNum;
    2750           0 :           if (Schedule.isLoopCarried(this, *PhiInst))
    2751           0 :             ReuseStage -= LVNumStages;
    2752             :           // Check if the Phi to reuse has been generated yet. If not, then
    2753             :           // there is nothing to reuse.
    2754           0 :           if (VRMap[ReuseStage - np].count(LoopVal)) {
    2755           0 :             NewReg = VRMap[ReuseStage - np][LoopVal];
    2756             : 
    2757           0 :             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    2758             :                                   &*BBI, Def, NewReg);
    2759             :             // Update the map with the new Phi name.
    2760           0 :             VRMap[CurStageNum - np][Def] = NewReg;
    2761             :             PhiOp2 = NewReg;
    2762           0 :             if (VRMap[LastStageNum - np - 1].count(LoopVal))
    2763           0 :               PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
    2764             : 
    2765           0 :             if (IsLast && np == NumPhis - 1)
    2766           0 :               replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    2767           0 :             continue;
    2768             :           }
    2769          47 :         } else if (InKernel && StageDiff > 0 &&
    2770           1 :                    VRMap[CurStageNum - StageDiff - np].count(LoopVal))
    2771           0 :           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
    2772             :       }
    2773             : 
    2774         772 :       const TargetRegisterClass *RC = MRI.getRegClass(Def);
    2775         772 :       NewReg = MRI.createVirtualRegister(RC);
    2776             : 
    2777             :       MachineInstrBuilder NewPhi =
    2778        1544 :           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
    2779        1544 :                   TII->get(TargetOpcode::PHI), NewReg);
    2780         772 :       NewPhi.addReg(PhiOp1).addMBB(BB1);
    2781         772 :       NewPhi.addReg(PhiOp2).addMBB(BB2);
    2782         772 :       if (np == 0)
    2783        1400 :         InstrMap[NewPhi] = &*BBI;
    2784             : 
    2785             :       // We define the Phis after creating the new pipelined code, so
    2786             :       // we need to rename the Phi values in scheduled instructions.
    2787             : 
    2788             :       unsigned PrevReg = 0;
    2789         772 :       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
    2790         302 :         PrevReg = VRMap[PrevStage - np][LoopVal];
    2791        1544 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
    2792             :                             Def, NewReg, PrevReg);
    2793             :       // If the Phi has been scheduled, use the new name for rewriting.
    2794         772 :       if (VRMap[CurStageNum - np].count(Def)) {
    2795          41 :         unsigned R = VRMap[CurStageNum - np][Def];
    2796          41 :         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
    2797             :                               R, NewReg);
    2798             :       }
    2799             : 
    2800             :       // Check if we need to rename any uses that occurs after the loop. The
    2801             :       // register to replace depends on whether the Phi is scheduled in the
    2802             :       // epilog.
    2803         772 :       if (IsLast && np == NumPhis - 1)
    2804         286 :         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    2805             : 
    2806             :       // In the kernel, a dependent Phi uses the value from this Phi.
    2807         772 :       if (InKernel)
    2808             :         PhiOp2 = NewReg;
    2809             : 
    2810             :       // Update the map with the new Phi name.
    2811         772 :       VRMap[CurStageNum - np][Def] = NewReg;
    2812             :     }
    2813             : 
    2814           8 :     while (NumPhis++ < NumStages) {
    2815          16 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
    2816             :                             &*BBI, Def, NewReg, 0);
    2817             :     }
    2818             : 
    2819             :     // Check if we need to rename a Phi that has been eliminated due to
    2820             :     // scheduling.
    2821         703 :     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
    2822           0 :       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
    2823             :   }
    2824         307 : }
    2825             : 
    2826             : /// Generate Phis for the specified block in the generated pipelined code.
    2827             : /// These are new Phis needed because the definition is scheduled after the
    2828             : /// use in the pipelened sequence.
    2829         307 : void SwingSchedulerDAG::generatePhis(
    2830             :     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    2831             :     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
    2832             :     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    2833             :     bool IsLast) {
    2834             :   // Compute the stage number that contains the initial Phi value, and
    2835             :   // the Phi from the previous stage.
    2836             :   unsigned PrologStage = 0;
    2837             :   unsigned PrevStage = 0;
    2838         307 :   unsigned StageDiff = CurStageNum - LastStageNum;
    2839             :   bool InKernel = (StageDiff == 0);
    2840         307 :   if (InKernel) {
    2841         130 :     PrologStage = LastStageNum - 1;
    2842             :     PrevStage = CurStageNum;
    2843             :   } else {
    2844         177 :     PrologStage = LastStageNum - StageDiff;
    2845         177 :     PrevStage = LastStageNum + StageDiff - 1;
    2846             :   }
    2847             : 
    2848         307 :   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
    2849         307 :                                    BBE = BB->instr_end();
    2850        3482 :        BBI != BBE; ++BBI) {
    2851       14042 :     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
    2852       10867 :       MachineOperand &MO = BBI->getOperand(i);
    2853       23066 :       if (!MO.isReg() || !MO.isDef() ||
    2854        3564 :           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    2855       17208 :         continue;
    2856             : 
    2857        2566 :       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
    2858             :       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
    2859        2566 :       unsigned Def = MO.getReg();
    2860        2566 :       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
    2861             :       // An instruction scheduled in stage 0 and is used after the loop
    2862             :       // requires a phi in the epilog for the last definition from either
    2863             :       // the kernel or prolog.
    2864        2961 :       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
    2865         395 :           hasUseAfterLoop(Def, BB, MRI))
    2866             :         NumPhis = 1;
    2867        2566 :       if (!InKernel && (unsigned)StageScheduled > PrologStage)
    2868         606 :         continue;
    2869             : 
    2870        3920 :       unsigned PhiOp2 = VRMap[PrevStage][Def];
    2871        1960 :       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
    2872          94 :         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
    2873             :           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
    2874             :       // The number of Phis can't exceed the number of prolog stages. The
    2875             :       // prolog stage number is zero based.
    2876        1960 :       if (NumPhis > PrologStage + 1 - StageScheduled)
    2877             :         NumPhis = PrologStage + 1 - StageScheduled;
    2878        3368 :       for (unsigned np = 0; np < NumPhis; ++np) {
    2879        1408 :         unsigned PhiOp1 = VRMap[PrologStage][Def];
    2880         704 :         if (np <= PrologStage)
    2881        1408 :           PhiOp1 = VRMap[PrologStage - np][Def];
    2882         704 :         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
    2883         317 :           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
    2884             :             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
    2885         317 :           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
    2886             :             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
    2887             :         }
    2888         704 :         if (!InKernel)
    2889         806 :           PhiOp2 = VRMap[PrevStage - np][Def];
    2890             : 
    2891         704 :         const TargetRegisterClass *RC = MRI.getRegClass(Def);
    2892         704 :         unsigned NewReg = MRI.createVirtualRegister(RC);
    2893             : 
    2894             :         MachineInstrBuilder NewPhi =
    2895        1408 :             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
    2896        1408 :                     TII->get(TargetOpcode::PHI), NewReg);
    2897         704 :         NewPhi.addReg(PhiOp1).addMBB(BB1);
    2898         704 :         NewPhi.addReg(PhiOp2).addMBB(BB2);
    2899         704 :         if (np == 0)
    2900        1340 :           InstrMap[NewPhi] = &*BBI;
    2901             : 
    2902             :         // Rewrite uses and update the map. The actions depend upon whether
    2903             :         // we generating code for the kernel or epilog blocks.
    2904         704 :         if (InKernel) {
    2905         301 :           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    2906             :                                 &*BBI, PhiOp1, NewReg);
    2907         301 :           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    2908             :                                 &*BBI, PhiOp2, NewReg);
    2909             : 
    2910             :           PhiOp2 = NewReg;
    2911         602 :           VRMap[PrevStage - np - 1][Def] = NewReg;
    2912             :         } else {
    2913         806 :           VRMap[CurStageNum - np][Def] = NewReg;
    2914         403 :           if (np == NumPhis - 1)
    2915         770 :             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    2916             :                                   &*BBI, Def, NewReg);
    2917             :         }
    2918         704 :         if (IsLast && np == NumPhis - 1)
    2919         229 :           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    2920             :       }
    2921             :     }
    2922             :   }
    2923         307 : }
    2924             : 
    2925             : /// Remove instructions that generate values with no uses.
    2926             : /// Typically, these are induction variable operations that generate values
    2927             : /// used in the loop itself.  A dead instruction has a definition with
    2928             : /// no uses, or uses that occur in the original loop only.
    2929         130 : void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
    2930             :                                                MBBVectorTy &EpilogBBs) {
    2931             :   // For each epilog block, check that the value defined by each instruction
    2932             :   // is used.  If not, delete it.
    2933         130 :   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
    2934         130 :                                      MBE = EpilogBBs.rend();
    2935         307 :        MBB != MBE; ++MBB)
    2936         177 :     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
    2937             :                                                    ME = (*MBB)->instr_rend();
    2938        1858 :          MI != ME;) {
    2939             :       // From DeadMachineInstructionElem. Don't delete inline assembly.
    2940        1681 :       if (MI->isInlineAsm()) {
    2941             :         ++MI;
    2942         628 :         continue;
    2943             :       }
    2944        1681 :       bool SawStore = false;
    2945             :       // Check if it's safe to remove the instruction due to side effects.
    2946             :       // We can, and want to, remove Phis here.
    2947        2013 :       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
    2948             :         ++MI;
    2949         332 :         continue;
    2950             :       }
    2951             :       bool used = true;
    2952        2785 :       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
    2953        1349 :                                       MOE = MI->operands_end();
    2954        2785 :            MOI != MOE; ++MOI) {
    2955        5564 :         if (!MOI->isReg() || !MOI->isDef())
    2956        1140 :           continue;
    2957        1349 :         unsigned reg = MOI->getReg();
    2958             :         // Assume physical registers are used, unless they are marked dead.
    2959        1349 :         if (TargetRegisterInfo::isPhysicalRegister(reg)) {
    2960           0 :           used = !MOI->isDead();
    2961           0 :           if (used)
    2962             :             break;
    2963           0 :           continue;
    2964             :         }
    2965             :         unsigned realUses = 0;
    2966        1349 :         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
    2967             :                                                EI = MRI.use_end();
    2968        1349 :              UI != EI; ++UI) {
    2969             :           // Check if there are any uses that occur only in the original
    2970             :           // loop.  If so, that's not a real use.
    2971        1053 :           if (UI->getParent()->getParent() != BB) {
    2972             :             realUses++;
    2973             :             used = true;
    2974             :             break;
    2975             :           }
    2976             :         }
    2977        1349 :         if (realUses > 0)
    2978             :           break;
    2979             :         used = false;
    2980             :       }
    2981        1645 :       if (!used) {
    2982         296 :         LIS.RemoveMachineInstrFromMaps(*MI);
    2983         296 :         MI++->eraseFromParent();
    2984         296 :         continue;
    2985             :       }
    2986             :       ++MI;
    2987             :     }
    2988             :   // In the kernel block, check if we can remove a Phi that generates a value
    2989             :   // used in an instruction removed in the epilog block.
    2990             :   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
    2991         130 :                                    BBE = KernelBB->getFirstNonPHI();
    2992         745 :        BBI != BBE;) {
    2993             :     MachineInstr *MI = &*BBI;
    2994             :     ++BBI;
    2995         615 :     unsigned reg = MI->getOperand(0).getReg();
    2996        1230 :     if (MRI.use_begin(reg) == MRI.use_end()) {
    2997           0 :       LIS.RemoveMachineInstrFromMaps(*MI);
    2998           0 :       MI->eraseFromParent();
    2999             :     }
    3000             :   }
    3001         130 : }
    3002             : 
    3003             : /// For loop carried definitions, we split the lifetime of a virtual register
    3004             : /// that has uses past the definition in the next iteration. A copy with a new
    3005             : /// virtual register is inserted before the definition, which helps with
    3006             : /// generating a better register assignment.
    3007             : ///
    3008             : ///   v1 = phi(a, v2)     v1 = phi(a, v2)
    3009             : ///   v2 = phi(b, v3)     v2 = phi(b, v3)
    3010             : ///   v3 = ..             v4 = copy v1
    3011             : ///   .. = V1             v3 = ..
    3012             : ///                       .. = v4
    3013         130 : void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
    3014             :                                        MBBVectorTy &EpilogBBs,
    3015             :                                        SMSchedule &Schedule) {
    3016         130 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3017         875 :   for (auto &PHI : KernelBB->phis()) {
    3018         615 :     unsigned Def = PHI.getOperand(0).getReg();
    3019             :     // Check for any Phi definition that used as an operand of another Phi
    3020             :     // in the same block.
    3021        1606 :     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
    3022             :                                                  E = MRI.use_instr_end();
    3023        1606 :          I != E; ++I) {
    3024         238 :       if (I->isPHI() && I->getParent() == KernelBB) {
    3025             :         // Get the loop carried definition.
    3026             :         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
    3027          60 :         if (!LCDef)
    3028             :           continue;
    3029          60 :         MachineInstr *MI = MRI.getVRegDef(LCDef);
    3030          67 :         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
    3031             :           continue;
    3032             :         // Search through the rest of the block looking for uses of the Phi
    3033             :         // definition. If one occurs, then split the lifetime.
    3034             :         unsigned SplitReg = 0;
    3035             :         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
    3036         415 :                                     KernelBB->instr_end()))
    3037         362 :           if (BBJ.readsRegister(Def)) {
    3038             :             // We split the lifetime when we find the first use.
    3039          38 :             if (SplitReg == 0) {
    3040          52 :               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
    3041          52 :               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
    3042          26 :                       TII->get(TargetOpcode::COPY), SplitReg)
    3043          26 :                   .addReg(Def);
    3044             :             }
    3045          38 :             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
    3046             :           }
    3047          53 :         if (!SplitReg)
    3048             :           continue;
    3049             :         // Search through each of the epilog blocks for any uses to be renamed.
    3050         104 :         for (auto &Epilog : EpilogBBs)
    3051         716 :           for (auto &I : *Epilog)
    3052         599 :             if (I.readsRegister(Def))
    3053          49 :               I.substituteRegister(Def, SplitReg, 0, *TRI);
    3054             :         break;
    3055             :       }
    3056             :     }
    3057             :   }
    3058         130 : }
    3059             : 
    3060             : /// Remove the incoming block from the Phis in a basic block.
    3061          43 : static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
    3062         221 :   for (MachineInstr &MI : *BB) {
    3063             :     if (!MI.isPHI())
    3064             :       break;
    3065         151 :     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
    3066         302 :       if (MI.getOperand(i + 1).getMBB() == Incoming) {
    3067         135 :         MI.RemoveOperand(i + 1);
    3068         135 :         MI.RemoveOperand(i);
    3069         135 :         break;
    3070             :       }
    3071             :   }
    3072          43 : }
    3073             : 
    3074             : /// Create branches from each prolog basic block to the appropriate epilog
    3075             : /// block.  These edges are needed if the loop ends before reaching the
    3076             : /// kernel.
    3077         130 : void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
    3078             :                                     MachineBasicBlock *KernelBB,
    3079             :                                     MBBVectorTy &EpilogBBs,
    3080             :                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
    3081             :   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
    3082         130 :   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
    3083         130 :   MachineInstr *Cmp = Pass.LI.LoopCompare;
    3084             :   MachineBasicBlock *LastPro = KernelBB;
    3085             :   MachineBasicBlock *LastEpi = KernelBB;
    3086             : 
    3087             :   // Start from the blocks connected to the kernel and work "out"
    3088             :   // to the first prolog and the last epilog blocks.
    3089             :   SmallVector<MachineInstr *, 4> PrevInsts;
    3090         130 :   unsigned MaxIter = PrologBBs.size() - 1;
    3091             :   unsigned LC = UINT_MAX;
    3092             :   unsigned LCMin = UINT_MAX;
    3093         484 :   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    3094             :     // Add branches to the prolog that go to the corresponding
    3095             :     // epilog, and the fall-thru prolog/kernel block.
    3096         354 :     MachineBasicBlock *Prolog = PrologBBs[j];
    3097         354 :     MachineBasicBlock *Epilog = EpilogBBs[i];
    3098             :     // We've executed one iteration, so decrement the loop count and check for
    3099             :     // the loop end.
    3100             :     SmallVector<MachineOperand, 4> Cond;
    3101             :     // Check if the LOOP0 has already been removed. If so, then there is no need
    3102             :     // to reduce the trip count.
    3103         177 :     if (LC != 0)
    3104         352 :       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
    3105         176 :                                 MaxIter);
    3106             : 
    3107             :     // Record the value of the first trip count, which is used to determine if
    3108             :     // branches and blocks can be removed for constant trip counts.
    3109         177 :     if (LCMin == UINT_MAX)
    3110             :       LCMin = LC;
    3111             : 
    3112             :     unsigned numAdded = 0;
    3113         177 :     if (TargetRegisterInfo::isVirtualRegister(LC)) {
    3114         134 :       Prolog->addSuccessor(Epilog);
    3115         402 :       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    3116          43 :     } else if (j >= LCMin) {
    3117           5 :       Prolog->addSuccessor(Epilog);
    3118           5 :       Prolog->removeSuccessor(LastPro);
    3119           5 :       LastEpi->removeSuccessor(Epilog);
    3120          15 :       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
    3121           5 :       removePhis(Epilog, LastEpi);
    3122             :       // Remove the blocks that are no longer referenced.
    3123           5 :       if (LastPro != LastEpi) {
    3124             :         LastEpi->clear();
    3125           1 :         LastEpi->eraseFromParent();
    3126             :       }
    3127             :       LastPro->clear();
    3128           5 :       LastPro->eraseFromParent();
    3129             :     } else {
    3130         114 :       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
    3131          38 :       removePhis(Epilog, Prolog);
    3132             :     }
    3133             :     LastPro = Prolog;
    3134             :     LastEpi = Epilog;
    3135         311 :     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
    3136             :                                                    E = Prolog->instr_rend();
    3137         488 :          I != E && numAdded > 0; ++I, --numAdded)
    3138         311 :       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
    3139             :   }
    3140         130 : }
    3141             : 
    3142             : /// Return true if we can compute the amount the instruction changes
    3143             : /// during each iteration. Set Delta to the amount of the change.
    3144        1156 : bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
    3145        1156 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3146             :   unsigned BaseReg;
    3147             :   int64_t Offset;
    3148        1156 :   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
    3149             :     return false;
    3150             : 
    3151        1015 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3152             :   // Check if there is a Phi. If so, get the definition in the loop.
    3153        1015 :   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
    3154        1015 :   if (BaseDef && BaseDef->isPHI()) {
    3155        1846 :     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    3156         923 :     BaseDef = MRI.getVRegDef(BaseReg);
    3157             :   }
    3158        1015 :   if (!BaseDef)
    3159             :     return false;
    3160             : 
    3161        1000 :   int D = 0;
    3162        1000 :   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    3163             :     return false;
    3164             : 
    3165         916 :   Delta = D;
    3166             :   return true;
    3167             : }
    3168             : 
    3169             : /// Update the memory operand with a new offset when the pipeliner
    3170             : /// generates a new copy of the instruction that refers to a
    3171             : /// different memory location.
    3172        2561 : void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
    3173             :                                           MachineInstr &OldMI, unsigned Num) {
    3174        2561 :   if (Num == 0)
    3175             :     return;
    3176             :   // If the instruction has memory operands, then adjust the offset
    3177             :   // when the instruction appears in different stages.
    3178        3042 :   unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin();
    3179        1521 :   if (NumRefs == 0)
    3180             :     return;
    3181         483 :   MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs);
    3182             :   unsigned Refs = 0;
    3183         899 :   for (MachineMemOperand *MMO : NewMI.memoperands()) {
    3184        1932 :     if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
    3185             :         (!MMO->getValue())) {
    3186           0 :       NewMemRefs[Refs++] = MMO;
    3187           0 :       continue;
    3188             :     }
    3189             :     unsigned Delta;
    3190         765 :     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
    3191         208 :       int64_t AdjOffset = Delta * Num;
    3192         416 :       NewMemRefs[Refs++] =
    3193         416 :           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize());
    3194             :     } else {
    3195             :       NewMI.dropMemRefs();
    3196         275 :       return;
    3197             :     }
    3198             :   }
    3199         208 :   NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
    3200             : }
    3201             : 
    3202             : /// Clone the instruction for the new pipelined loop and update the
    3203             : /// memory operands, if needed.
    3204        1730 : MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
    3205             :                                             unsigned CurStageNum,
    3206             :                                             unsigned InstStageNum) {
    3207        1730 :   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
    3208             :   // Check for tied operands in inline asm instructions. This should be handled
    3209             :   // elsewhere, but I'm not sure of the best solution.
    3210        1730 :   if (OldMI->isInlineAsm())
    3211           0 :     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
    3212           0 :       const auto &MO = OldMI->getOperand(i);
    3213           0 :       if (MO.isReg() && MO.isUse())
    3214             :         break;
    3215             :       unsigned UseIdx;
    3216           0 :       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
    3217           0 :         NewMI->tieOperands(i, UseIdx);
    3218             :     }
    3219        1730 :   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
    3220        1730 :   return NewMI;
    3221             : }
    3222             : 
    3223             : /// Clone the instruction for the new pipelined loop. If needed, this
    3224             : /// function updates the instruction using the values saved in the
    3225             : /// InstrChanges structure.
    3226         831 : MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
    3227             :                                                      unsigned CurStageNum,
    3228             :                                                      unsigned InstStageNum,
    3229             :                                                      SMSchedule &Schedule) {
    3230         831 :   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
    3231             :   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    3232        1662 :       InstrChanges.find(getSUnit(OldMI));
    3233         831 :   if (It != InstrChanges.end()) {
    3234          29 :     std::pair<unsigned, int64_t> RegAndOffset = It->second;
    3235             :     unsigned BasePos, OffsetPos;
    3236          29 :     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
    3237           0 :       return nullptr;
    3238          58 :     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
    3239          29 :     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
    3240          29 :     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
    3241          28 :       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
    3242          29 :     NewMI->getOperand(OffsetPos).setImm(NewOffset);
    3243             :   }
    3244         831 :   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
    3245         831 :   return NewMI;
    3246             : }
    3247             : 
    3248             : /// Update the machine instruction with new virtual registers.  This
    3249             : /// function may change the defintions and/or uses.
    3250        3132 : void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
    3251             :                                           unsigned CurStageNum,
    3252             :                                           unsigned InstrStageNum,
    3253             :                                           SMSchedule &Schedule,
    3254             :                                           ValueMapTy *VRMap) {
    3255       13516 :   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
    3256       10384 :     MachineOperand &MO = NewMI->getOperand(i);
    3257       18579 :     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    3258        3266 :       continue;
    3259        7118 :     unsigned reg = MO.getReg();
    3260        7118 :     if (MO.isDef()) {
    3261             :       // Create a new virtual register for the definition.
    3262        2566 :       const TargetRegisterClass *RC = MRI.getRegClass(reg);
    3263        2566 :       unsigned NewReg = MRI.createVirtualRegister(RC);
    3264        2566 :       MO.setReg(NewReg);
    3265        5132 :       VRMap[CurStageNum][reg] = NewReg;
    3266        2566 :       if (LastDef)
    3267         500 :         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
    3268             :     } else if (MO.isUse()) {
    3269        4552 :       MachineInstr *Def = MRI.getVRegDef(reg);
    3270             :       // Compute the stage that contains the last definition for instruction.
    3271        4552 :       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
    3272             :       unsigned StageNum = CurStageNum;
    3273        4552 :       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
    3274             :         // Compute the difference in stages between the defintion and the use.
    3275         497 :         unsigned StageDiff = (InstrStageNum - DefStageNum);
    3276             :         // Make an adjustment to get the last definition.
    3277         497 :         StageNum -= StageDiff;
    3278             :       }
    3279        4552 :       if (VRMap[StageNum].count(reg))
    3280        2040 :         MO.setReg(VRMap[StageNum][reg]);
    3281             :     }
    3282             :   }
    3283        3132 : }
    3284             : 
    3285             : /// Return the instruction in the loop that defines the register.
    3286             : /// If the definition is a Phi, then follow the Phi operand to
    3287             : /// the instruction in the loop.
    3288          57 : MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
    3289             :   SmallPtrSet<MachineInstr *, 8> Visited;
    3290          57 :   MachineInstr *Def = MRI.getVRegDef(Reg);
    3291             :   while (Def->isPHI()) {
    3292          28 :     if (!Visited.insert(Def).second)
    3293             :       break;
    3294          56 :     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
    3295         112 :       if (Def->getOperand(i + 1).getMBB() == BB) {
    3296          56 :         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
    3297             :         break;
    3298             :       }
    3299             :   }
    3300          57 :   return Def;
    3301             : }
    3302             : 
    3303             : /// Return the new name for the value from the previous stage.
    3304         438 : unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
    3305             :                                           unsigned LoopVal, unsigned LoopStage,
    3306             :                                           ValueMapTy *VRMap,
    3307             :                                           MachineBasicBlock *BB) {
    3308             :   unsigned PrevVal = 0;
    3309         438 :   if (StageNum > PhiStage) {
    3310          76 :     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
    3311          76 :     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
    3312             :       // The name is defined in the previous stage.
    3313          58 :       PrevVal = VRMap[StageNum - 1][LoopVal];
    3314          18 :     else if (VRMap[StageNum].count(LoopVal))
    3315             :       // The previous name is defined in the current stage when the instruction
    3316             :       // order is swapped.
    3317           7 :       PrevVal = VRMap[StageNum][LoopVal];
    3318          11 :     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
    3319             :       // The loop value hasn't yet been scheduled.
    3320           1 :       PrevVal = LoopVal;
    3321          10 :     else if (StageNum == PhiStage + 1)
    3322             :       // The loop value is another phi, which has not been scheduled.
    3323             :       PrevVal = getInitPhiReg(*LoopInst, BB);
    3324           0 :     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
    3325             :       // The loop value is another phi, which has been scheduled.
    3326           0 :       PrevVal =
    3327           0 :           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
    3328             :                         LoopStage, VRMap, BB);
    3329             :   }
    3330         438 :   return PrevVal;
    3331             : }
    3332             : 
    3333             : /// Rewrite the Phi values in the specified block to use the mappings
    3334             : /// from the initial operand. Once the Phi is scheduled, we switch
    3335             : /// to using the loop value instead of the Phi value, so those names
    3336             : /// do not need to be rewritten.
    3337         177 : void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
    3338             :                                          unsigned StageNum,
    3339             :                                          SMSchedule &Schedule,
    3340             :                                          ValueMapTy *VRMap,
    3341             :                                          InstrMapTy &InstrMap) {
    3342         948 :   for (auto &PHI : BB->phis()) {
    3343             :     unsigned InitVal = 0;
    3344             :     unsigned LoopVal = 0;
    3345         417 :     getPhiRegs(PHI, BB, InitVal, LoopVal);
    3346         417 :     unsigned PhiDef = PHI.getOperand(0).getReg();
    3347             : 
    3348             :     unsigned PhiStage =
    3349         834 :         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
    3350             :     unsigned LoopStage =
    3351         834 :         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
    3352             :     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
    3353         417 :     if (NumPhis > StageNum)
    3354             :       NumPhis = StageNum;
    3355        1293 :     for (unsigned np = 0; np <= NumPhis; ++np) {
    3356             :       unsigned NewVal =
    3357         438 :           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
    3358         438 :       if (!NewVal)
    3359             :         NewVal = InitVal;
    3360         438 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
    3361             :                             PhiDef, NewVal);
    3362             :     }
    3363             :   }
    3364         177 : }
    3365             : 
    3366             : /// Rewrite a previously scheduled instruction to use the register value
    3367             : /// from the new instruction. Make sure the instruction occurs in the
    3368             : /// basic block, and we don't change the uses in the new instruction.
    3369        2249 : void SwingSchedulerDAG::rewriteScheduledInstr(
    3370             :     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
    3371             :     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
    3372             :     unsigned NewReg, unsigned PrevReg) {
    3373        4498 :   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
    3374        2249 :   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
    3375             :   // Rewrite uses that have been scheduled already to use the new
    3376             :   // Phi register.
    3377        2249 :   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
    3378             :                                          EI = MRI.use_end();
    3379        7801 :        UI != EI;) {
    3380             :     MachineOperand &UseOp = *UI;
    3381        5552 :     MachineInstr *UseMI = UseOp.getParent();
    3382             :     ++UI;
    3383        5552 :     if (UseMI->getParent() != BB)
    3384        6458 :       continue;
    3385             :     if (UseMI->isPHI()) {
    3386        1224 :       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
    3387         602 :         continue;
    3388          92 :       if (getLoopPhiReg(*UseMI, BB) != OldReg)
    3389          74 :         continue;
    3390             :     }
    3391        1985 :     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    3392             :     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    3393        1985 :     SUnit *OrigMISU = getSUnit(OrigInstr->second);
    3394        1985 :     int StageSched = Schedule.stageScheduled(OrigMISU);
    3395             :     int CycleSched = Schedule.cycleScheduled(OrigMISU);
    3396             :     unsigned ReplaceReg = 0;
    3397             :     // This is the stage for the scheduled instruction.
    3398        1985 :     if (StagePhi == StageSched && Phi->isPHI()) {
    3399             :       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
    3400        1048 :       if (PrevReg && InProlog)
    3401             :         ReplaceReg = PrevReg;
    3402        1048 :       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
    3403           0 :                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
    3404             :         ReplaceReg = PrevReg;
    3405             :       else
    3406             :         ReplaceReg = NewReg;
    3407             :     }
    3408             :     // The scheduled instruction occurs before the scheduled Phi, and the
    3409             :     // Phi is not loop carried.
    3410        2769 :     if (!InProlog && StagePhi + 1 == StageSched &&
    3411         784 :         !Schedule.isLoopCarried(this, *Phi))
    3412             :       ReplaceReg = NewReg;
    3413        1985 :     if (StagePhi > StageSched && Phi->isPHI())
    3414             :       ReplaceReg = NewReg;
    3415        2715 :     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
    3416             :       ReplaceReg = NewReg;
    3417        1985 :     if (ReplaceReg) {
    3418        3596 :       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
    3419        1798 :       UseOp.setReg(ReplaceReg);
    3420             :     }
    3421             :   }
    3422        2249 : }
    3423             : 
    3424             : /// Check if we can change the instruction to use an offset value from the
    3425             : /// previous iteration. If so, return true and set the base and offset values
    3426             : /// so that we can rewrite the load, if necessary.
    3427             : ///   v1 = Phi(v0, v3)
    3428             : ///   v2 = load v1, 0
    3429             : ///   v3 = post_store v1, 4, x
    3430             : /// This function enables the load to be rewritten as v2 = load v3, 4.
    3431        2066 : bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
    3432             :                                               unsigned &BasePos,
    3433             :                                               unsigned &OffsetPos,
    3434             :                                               unsigned &NewBase,
    3435             :                                               int64_t &Offset) {
    3436             :   // Get the load instruction.
    3437        2066 :   if (TII->isPostIncrement(*MI))
    3438             :     return false;
    3439             :   unsigned BasePosLd, OffsetPosLd;
    3440        1871 :   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    3441             :     return false;
    3442         570 :   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
    3443             : 
    3444             :   // Look for the Phi instruction.
    3445         285 :   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
    3446         285 :   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
    3447         285 :   if (!Phi || !Phi->isPHI())
    3448             :     return false;
    3449             :   // Get the register defined in the loop block.
    3450         178 :   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
    3451         178 :   if (!PrevReg)
    3452             :     return false;
    3453             : 
    3454             :   // Check for the post-increment load/store instruction.
    3455         178 :   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
    3456         178 :   if (!PrevDef || PrevDef == MI)
    3457             :     return false;
    3458             : 
    3459         178 :   if (!TII->isPostIncrement(*PrevDef))
    3460             :     return false;
    3461             : 
    3462          28 :   unsigned BasePos1 = 0, OffsetPos1 = 0;
    3463          28 :   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    3464             :     return false;
    3465             : 
    3466             :   // Make sure that the instructions do not access the same memory location in
    3467             :   // the next iteration.
    3468          56 :   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
    3469          56 :   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
    3470          28 :   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    3471          56 :   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
    3472          28 :   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
    3473          28 :   MF.DeleteMachineInstr(NewMI);
    3474          28 :   if (!Disjoint)
    3475             :     return false;
    3476             : 
    3477             :   // Set the return value once we determine that we return true.
    3478          28 :   BasePos = BasePosLd;
    3479          28 :   OffsetPos = OffsetPosLd;
    3480          28 :   NewBase = PrevReg;
    3481          28 :   Offset = StoreOffset;
    3482             :   return true;
    3483             : }
    3484             : 
    3485             : /// Apply changes to the instruction if needed. The changes are need
    3486             : /// to improve the scheduling and depend up on the final schedule.
    3487        1688 : void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
    3488             :                                          SMSchedule &Schedule) {
    3489             :   SUnit *SU = getSUnit(MI);
    3490             :   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    3491        1688 :       InstrChanges.find(SU);
    3492        1688 :   if (It != InstrChanges.end()) {
    3493          28 :     std::pair<unsigned, int64_t> RegAndOffset = It->second;
    3494             :     unsigned BasePos, OffsetPos;
    3495          28 :     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
    3496           0 :       return;
    3497          56 :     unsigned BaseReg = MI->getOperand(BasePos).getReg();
    3498          28 :     MachineInstr *LoopDef = findDefInLoop(BaseReg);
    3499          28 :     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    3500             :     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    3501          28 :     int BaseStageNum = Schedule.stageScheduled(SU);
    3502             :     int BaseCycleNum = Schedule.cycleScheduled(SU);
    3503          28 :     if (BaseStageNum < DefStageNum) {
    3504          20 :       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    3505          20 :       int OffsetDiff = DefStageNum - BaseStageNum;
    3506          20 :       if (DefCycleNum < BaseCycleNum) {
    3507           6 :         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
    3508           3 :         if (OffsetDiff > 0)
    3509           3 :           --OffsetDiff;
    3510             :       }
    3511             :       int64_t NewOffset =
    3512          40 :           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
    3513          20 :       NewMI->getOperand(OffsetPos).setImm(NewOffset);
    3514             :       SU->setInstr(NewMI);
    3515          40 :       MISUnitMap[NewMI] = SU;
    3516          20 :       NewMIs.insert(NewMI);
    3517             :     }
    3518             :   }
    3519             : }
    3520             : 
    3521             : /// Return true for an order or output dependence that is loop carried
    3522             : /// potentially. A dependence is loop carried if the destination defines a valu
    3523             : /// that may be used or defined by the source in a subsequent iteration.
    3524        4329 : bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
    3525             :                                          bool isSucc) {
    3526        4329 :   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
    3527             :       Dep.isArtificial())
    3528             :     return false;
    3529             : 
    3530         670 :   if (!SwpPruneLoopCarried)
    3531             :     return true;
    3532             : 
    3533         668 :   if (Dep.getKind() == SDep::Output)
    3534             :     return true;
    3535             : 
    3536         668 :   MachineInstr *SI = Source->getInstr();
    3537         668 :   MachineInstr *DI = Dep.getSUnit()->getInstr();
    3538         668 :   if (!isSucc)
    3539             :     std::swap(SI, DI);
    3540             :   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
    3541             : 
    3542             :   // Assume ordered loads and stores may have a loop carried dependence.
    3543        1998 :   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
    3544        1967 :       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    3545             :     return true;
    3546             : 
    3547             :   // Only chain dependences between a load and store can be loop carried.
    3548         620 :   if (!DI->mayStore() || !SI->mayLoad())
    3549             :     return false;
    3550             : 
    3551             :   unsigned DeltaS, DeltaD;
    3552         514 :   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    3553             :     return true;
    3554             : 
    3555             :   unsigned BaseRegS, BaseRegD;
    3556             :   int64_t OffsetS, OffsetD;
    3557         348 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3558         696 :   if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
    3559         348 :       !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
    3560             :     return true;
    3561             : 
    3562         348 :   if (BaseRegS != BaseRegD)
    3563             :     return true;
    3564             : 
    3565             :   // Check that the base register is incremented by a constant value for each
    3566             :   // iteration.
    3567         116 :   MachineInstr *Def = MRI.getVRegDef(BaseRegS);
    3568         116 :   if (!Def || !Def->isPHI())
    3569             :     return true;
    3570             :   unsigned InitVal = 0;
    3571             :   unsigned LoopVal = 0;
    3572         116 :   getPhiRegs(*Def, BB, InitVal, LoopVal);
    3573         116 :   MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
    3574         116 :   int D = 0;
    3575         116 :   if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
    3576             :     return true;
    3577             : 
    3578         116 :   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
    3579         116 :   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
    3580             : 
    3581             :   // This is the main test, which checks the offset values and the loop
    3582             :   // increment value to determine if the accesses may be loop carried.
    3583         116 :   if (OffsetS >= OffsetD)
    3584          70 :     return OffsetS + AccessSizeS > DeltaS;
    3585             :   else
    3586          46 :     return OffsetD + AccessSizeD > DeltaD;
    3587             : 
    3588             :   return true;
    3589             : }
    3590             : 
    3591             : void SwingSchedulerDAG::postprocessDAG() {
    3592         579 :   for (auto &M : Mutations)
    3593         386 :     M->apply(this);
    3594             : }
    3595             : 
    3596             : /// Try to schedule the node at the specified StartCycle and continue
    3597             : /// until the node is schedule or the EndCycle is reached.  This function
    3598             : /// returns true if the node is scheduled.  This routine may search either
    3599             : /// forward or backward for a place to insert the instruction based upon
    3600             : /// the relative values of StartCycle and EndCycle.
    3601        3030 : bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
    3602             :   bool forward = true;
    3603        3030 :   if (StartCycle > EndCycle)
    3604             :     forward = false;
    3605             : 
    3606             :   // The terminating condition depends on the direction.
    3607        1562 :   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
    3608        4439 :   for (int curCycle = StartCycle; curCycle != termCycle;
    3609             :        forward ? ++curCycle : --curCycle) {
    3610             : 
    3611             :     // Add the already scheduled instructions at the specified cycle to the DFA.
    3612             :     Resources->clearResources();
    3613        8946 :     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
    3614       13976 :          checkCycle <= LastCycle; checkCycle += II) {
    3615        5030 :       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
    3616             : 
    3617             :       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
    3618             :                                          E = cycleInstrs.end();
    3619       11014 :            I != E; ++I) {
    3620       11968 :         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
    3621        1447 :           continue;
    3622             :         assert(Resources->canReserveResources(*(*I)->getInstr()) &&
    3623             :                "These instructions have already been scheduled.");
    3624        4537 :         Resources->reserveResources(*(*I)->getInstr());
    3625             :       }
    3626             :     }
    3627       11136 :     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
    3628        3304 :         Resources->canReserveResources(*SU->getInstr())) {
    3629             :       LLVM_DEBUG({
    3630             :         dbgs() << "\tinsert at cycle " << curCycle << " ";
    3631             :         SU->getInstr()->dump();
    3632             :       });
    3633             : 
    3634        6036 :       ScheduledInstrs[curCycle].push_back(SU);
    3635        6036 :       InstrToCycle.insert(std::make_pair(SU, curCycle));
    3636        3018 :       if (curCycle > LastCycle)
    3637         527 :         LastCycle = curCycle;
    3638        3018 :       if (curCycle < FirstCycle)
    3639         119 :         FirstCycle = curCycle;
    3640        3018 :       return true;
    3641             :     }
    3642             :     LLVM_DEBUG({
    3643             :       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
    3644             :       SU->getInstr()->dump();
    3645             :     });
    3646             :   }
    3647          12 :   return false;
    3648             : }
    3649             : 
    3650             : // Return the cycle of the earliest scheduled instruction in the chain.
    3651          16 : int SMSchedule::earliestCycleInChain(const SDep &Dep) {
    3652             :   SmallPtrSet<SUnit *, 8> Visited;
    3653             :   SmallVector<SDep, 8> Worklist;
    3654          16 :   Worklist.push_back(Dep);
    3655          16 :   int EarlyCycle = INT_MAX;
    3656         177 :   while (!Worklist.empty()) {
    3657         161 :     const SDep &Cur = Worklist.pop_back_val();
    3658             :     SUnit *PrevSU = Cur.getSUnit();
    3659         161 :     if (Visited.count(PrevSU))
    3660          72 :       continue;
    3661             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    3662          89 :     if (it == InstrToCycle.end())
    3663           1 :       continue;
    3664         176 :     EarlyCycle = std::min(EarlyCycle, it->second);
    3665         636 :     for (const auto &PI : PrevSU->Preds)
    3666         403 :       if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
    3667         145 :         Worklist.push_back(PI);
    3668          88 :     Visited.insert(PrevSU);
    3669             :   }
    3670          32 :   return EarlyCycle;
    3671             : }
    3672             : 
    3673             : // Return the cycle of the latest scheduled instruction in the chain.
    3674         332 : int SMSchedule::latestCycleInChain(const SDep &Dep) {
    3675             :   SmallPtrSet<SUnit *, 8> Visited;
    3676             :   SmallVector<SDep, 8> Worklist;
    3677         332 :   Worklist.push_back(Dep);
    3678         332 :   int LateCycle = INT_MIN;
    3679        1582 :   while (!Worklist.empty()) {
    3680        1250 :     const SDep &Cur = Worklist.pop_back_val();
    3681             :     SUnit *SuccSU = Cur.getSUnit();
    3682        1250 :     if (Visited.count(SuccSU))
    3683         279 :       continue;
    3684             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    3685         971 :     if (it == InstrToCycle.end())
    3686         181 :       continue;
    3687        1580 :     LateCycle = std::max(LateCycle, it->second);
    3688        3648 :     for (const auto &SI : SuccSU->Succs)
    3689        1940 :       if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
    3690         918 :         Worklist.push_back(SI);
    3691         790 :     Visited.insert(SuccSU);
    3692             :   }
    3693         664 :   return LateCycle;
    3694             : }
    3695             : 
    3696             : /// If an instruction has a use that spans multiple iterations, then
    3697             : /// return true. These instructions are characterized by having a back-ege
    3698             : /// to a Phi, which contains a reference to another Phi.
    3699       40439 : static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
    3700      204123 :   for (auto &P : SU->Preds)
    3701       89482 :     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
    3702       46368 :       for (auto &S : P.getSUnit()->Succs)
    3703       33238 :         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
    3704             :           return P.getSUnit();
    3705             :   return nullptr;
    3706             : }
    3707             : 
    3708             : /// Compute the scheduling start slot for the instruction.  The start slot
    3709             : /// depends on any predecessor or successor nodes scheduled already.
    3710        3107 : void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
    3711             :                               int *MinEnd, int *MaxStart, int II,
    3712             :                               SwingSchedulerDAG *DAG) {
    3713             :   // Iterate over each instruction that has been scheduled already.  The start
    3714             :   // slot computuation depends on whether the previously scheduled instruction
    3715             :   // is a predecessor or successor of the specified instruction.
    3716       26662 :   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    3717             : 
    3718             :     // Iterate over each instruction in the current cycle.
    3719       50604 :     for (SUnit *I : getInstructions(cycle)) {
    3720             :       // Because we're processing a DAG for the dependences, we recognize
    3721             :       // the back-edge in recurrences by anti dependences.
    3722       67488 :       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
    3723       40439 :         const SDep &Dep = SU->Preds[i];
    3724       40439 :         if (Dep.getSUnit() == I) {
    3725        1435 :           if (!DAG->isBackedge(SU, Dep)) {
    3726        2788 :             int EarlyStart = cycle + Dep.getLatency() -
    3727        2788 :                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
    3728        1394 :             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
    3729        1394 :             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
    3730          16 :               int End = earliestCycleInChain(Dep) + (II - 1);
    3731          16 :               *MinEnd = std::min(*MinEnd, End);
    3732             :             }
    3733             :           } else {
    3734          82 :             int LateStart = cycle - Dep.getLatency() +
    3735          82 :                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
    3736          41 :             *MinLateStart = std::min(*MinLateStart, LateStart);
    3737             :           }
    3738             :         }
    3739             :         // For instruction that requires multiple iterations, make sure that
    3740             :         // the dependent instruction is not scheduled past the definition.
    3741       40439 :         SUnit *BE = multipleIterations(I, DAG);
    3742       40928 :         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
    3743             :             !SU->isPred(I))
    3744          39 :           *MinLateStart = std::min(*MinLateStart, cycle);
    3745             :       }
    3746       69632 :       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
    3747       85166 :         if (SU->Succs[i].getSUnit() == I) {
    3748             :           const SDep &Dep = SU->Succs[i];
    3749        2830 :           if (!DAG->isBackedge(SU, Dep)) {
    3750        4870 :             int LateStart = cycle - Dep.getLatency() +
    3751        4870 :                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
    3752        2435 :             *MinLateStart = std::min(*MinLateStart, LateStart);
    3753        2435 :             if (DAG->isLoopCarriedDep(SU, Dep)) {
    3754         332 :               int Start = latestCycleInChain(Dep) + 1 - II;
    3755         332 :               *MaxStart = std::max(*MaxStart, Start);
    3756             :             }
    3757             :           } else {
    3758         790 :             int EarlyStart = cycle + Dep.getLatency() -
    3759         790 :                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
    3760         395 :             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
    3761             :           }
    3762             :         }
    3763             :       }
    3764             :     }
    3765             :   }
    3766        3107 : }
    3767             : 
    3768             : /// Order the instructions within a cycle so that the definitions occur
    3769             : /// before the uses. Returns true if the instruction is added to the start
    3770             : /// of the list, or false if added to the end.
    3771        1445 : void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
    3772             :                                  std::deque<SUnit *> &Insts) {
    3773        1445 :   MachineInstr *MI = SU->getInstr();
    3774             :   bool OrderBeforeUse = false;
    3775             :   bool OrderAfterDef = false;
    3776             :   bool OrderBeforeDef = false;
    3777             :   unsigned MoveDef = 0;
    3778             :   unsigned MoveUse = 0;
    3779        1445 :   int StageInst1 = stageScheduled(SU);
    3780             : 
    3781             :   unsigned Pos = 0;
    3782        4443 :   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
    3783             :        ++I, ++Pos) {
    3784        6449 :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    3785        4950 :       MachineOperand &MO = MI->getOperand(i);
    3786        8851 :       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    3787        1095 :         continue;
    3788             : 
    3789             :       unsigned Reg = MO.getReg();
    3790             :       unsigned BasePos, OffsetPos;
    3791        3855 :       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
    3792        1940 :         if (MI->getOperand(BasePos).getReg() == Reg)
    3793         517 :           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
    3794             :             Reg = NewReg;
    3795             :       bool Reads, Writes;
    3796             :       std::tie(Reads, Writes) =
    3797        7710 :           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
    3798        3855 :       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
    3799             :         OrderBeforeUse = true;
    3800          94 :         if (MoveUse == 0)
    3801             :           MoveUse = Pos;
    3802        5149 :       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
    3803             :         // Add the instruction after the scheduled instruction.
    3804             :         OrderAfterDef = true;
    3805             :         MoveDef = Pos;
    3806        3631 :       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
    3807         141 :         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
    3808             :           OrderBeforeUse = true;
    3809           0 :           if (MoveUse == 0)
    3810             :             MoveUse = Pos;
    3811             :         } else {
    3812             :           OrderAfterDef = true;
    3813             :           MoveDef = Pos;
    3814             :         }
    3815        5910 :       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
    3816             :         OrderBeforeUse = true;
    3817          28 :         if (MoveUse == 0)
    3818             :           MoveUse = Pos;
    3819          28 :         if (MoveUse != 0) {
    3820             :           OrderAfterDef = true;
    3821          12 :           MoveDef = Pos - 1;
    3822             :         }
    3823        5854 :       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
    3824             :         // Add the instruction before the scheduled instruction.
    3825             :         OrderBeforeUse = true;
    3826          49 :         if (MoveUse == 0)
    3827             :           MoveUse = Pos;
    3828        7002 :       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
    3829        1246 :                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
    3830         103 :         if (MoveUse == 0) {
    3831             :           OrderBeforeDef = true;
    3832             :           MoveUse = Pos;
    3833             :         }
    3834             :       }
    3835             :     }
    3836             :     // Check for order dependences between instructions. Make sure the source
    3837             :     // is ordered before the destination.
    3838        6300 :     for (auto &S : SU->Succs) {
    3839        1651 :       if (S.getSUnit() != *I)
    3840        1384 :         continue;
    3841         267 :       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
    3842             :         OrderBeforeUse = true;
    3843          13 :         if (Pos < MoveUse)
    3844             :           MoveUse = Pos;
    3845             :       }
    3846             :     }
    3847        5823 :     for (auto &P : SU->Preds) {
    3848        2162 :       if (P.getSUnit() != *I)
    3849        2061 :         continue;
    3850         101 :       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
    3851             :         OrderAfterDef = true;
    3852             :         MoveDef = Pos;
    3853             :       }
    3854             :     }
    3855             :   }
    3856             : 
    3857             :   // A circular dependence.
    3858        1445 :   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    3859             :     OrderBeforeUse = false;
    3860             : 
    3861             :   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
    3862             :   // to a loop-carried dependence.
    3863        1445 :   if (OrderBeforeDef)
    3864         101 :     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
    3865             : 
    3866             :   // The uncommon case when the instruction order needs to be updated because
    3867             :   // there is both a use and def.
    3868        1445 :   if (OrderBeforeUse && OrderAfterDef) {
    3869          47 :     SUnit *UseSU = Insts.at(MoveUse);
    3870          47 :     SUnit *DefSU = Insts.at(MoveDef);
    3871          47 :     if (MoveUse > MoveDef) {
    3872         129 :       Insts.erase(Insts.begin() + MoveUse);
    3873         129 :       Insts.erase(Insts.begin() + MoveDef);
    3874             :     } else {
    3875          12 :       Insts.erase(Insts.begin() + MoveDef);
    3876          12 :       Insts.erase(Insts.begin() + MoveUse);
    3877             :     }
    3878          47 :     orderDependence(SSD, UseSU, Insts);
    3879          47 :     orderDependence(SSD, SU, Insts);
    3880          47 :     orderDependence(SSD, DefSU, Insts);
    3881          47 :     return;
    3882             :   }
    3883             :   // Put the new instruction first if there is a use in the list. Otherwise,
    3884             :   // put it at the end of the list.
    3885        1398 :   if (OrderBeforeUse)
    3886         216 :     Insts.push_front(SU);
    3887             :   else
    3888        1182 :     Insts.push_back(SU);
    3889             : }
    3890             : 
    3891             : /// Return true if the scheduled Phi has a loop carried operand.
    3892        2251 : bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
    3893             :   if (!Phi.isPHI())
    3894             :     return false;
    3895             :   assert(Phi.isPHI() && "Expecing a Phi.");
    3896             :   SUnit *DefSU = SSD->getSUnit(&Phi);
    3897             :   unsigned DefCycle = cycleScheduled(DefSU);
    3898        1589 :   int DefStage = stageScheduled(DefSU);
    3899             : 
    3900             :   unsigned InitVal = 0;
    3901             :   unsigned LoopVal = 0;
    3902        1589 :   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
    3903        1589 :   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
    3904        1581 :   if (!UseSU)
    3905             :     return true;
    3906        1581 :   if (UseSU->getInstr()->isPHI())
    3907             :     return true;
    3908             :   unsigned LoopCycle = cycleScheduled(UseSU);
    3909        1536 :   int LoopStage = stageScheduled(UseSU);
    3910        1536 :   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
    3911             : }
    3912             : 
    3913             : /// Return true if the instruction is a definition that is loop carried
    3914             : /// and defines the use on the next iteration.
    3915             : ///        v1 = phi(v2, v3)
    3916             : ///  (Def) v3 = op v1
    3917             : ///  (MO)   = v1
    3918             : /// If MO appears before Def, then then v1 and v3 may get assigned to the same
    3919             : /// register.
    3920        1246 : bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
    3921             :                                        MachineInstr *Def, MachineOperand &MO) {
    3922        1246 :   if (!MO.isReg())
    3923             :     return false;
    3924             :   if (Def->isPHI())
    3925             :     return false;
    3926        1246 :   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
    3927        1626 :   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    3928             :     return false;
    3929         363 :   if (!isLoopCarried(SSD, *Phi))
    3930             :     return false;
    3931         345 :   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
    3932        1173 :   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
    3933         931 :     MachineOperand &DMO = Def->getOperand(i);
    3934        2243 :     if (!DMO.isReg() || !DMO.isDef())
    3935         564 :       continue;
    3936         367 :     if (DMO.getReg() == LoopReg)
    3937             :       return true;
    3938             :   }
    3939             :   return false;
    3940             : }
    3941             : 
    3942             : // Check if the generated schedule is valid. This function checks if
    3943             : // an instruction that uses a physical register is scheduled in a
    3944             : // different stage than the definition. The pipeliner does not handle
    3945             : // physical register values that may cross a basic block boundary.
    3946         178 : bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
    3947        2044 :   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
    3948        1688 :     SUnit &SU = SSD->SUnits[i];
    3949        1688 :     if (!SU.hasPhysRegDefs)
    3950        1688 :       continue;
    3951           0 :     int StageDef = stageScheduled(&SU);
    3952             :     assert(StageDef != -1 && "Instruction should have been scheduled.");
    3953           0 :     for (auto &SI : SU.Succs)
    3954             :       if (SI.isAssignedRegDep())
    3955           0 :         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
    3956           0 :           if (stageScheduled(SI.getSUnit()) != StageDef)
    3957             :             return false;
    3958             :   }
    3959             :   return true;
    3960             : }
    3961             : 
    3962             : /// A property of the node order in swing-modulo-scheduling is
    3963             : /// that for nodes outside circuits the following holds:
    3964             : /// none of them is scheduled after both a successor and a
    3965             : /// predecessor.
    3966             : /// The method below checks whether the property is met.
    3967             : /// If not, debug information is printed and statistics information updated.
    3968             : /// Note that we do not use an assert statement.
    3969             : /// The reason is that although an invalid node oder may prevent
    3970             : /// the pipeliner from finding a pipelined schedule for arbitrary II,
    3971             : /// it does not lead to the generation of incorrect code.
    3972         191 : void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
    3973             : 
    3974             :   // a sorted vector that maps each SUnit to its index in the NodeOrder
    3975             :   typedef std::pair<SUnit *, unsigned> UnitIndex;
    3976         191 :   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
    3977             : 
    3978        2091 :   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
    3979        5700 :     Indices.push_back(std::make_pair(NodeOrder[i], i));
    3980             : 
    3981             :   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
    3982             :     return std::get<0>(i1) < std::get<0>(i2);
    3983             :   };
    3984             : 
    3985             :   // sort, so that we can perform a binary search
    3986             :   llvm::sort(Indices.begin(), Indices.end(), CompareKey);
    3987             : 
    3988             :   bool Valid = true;
    3989             :   (void)Valid;
    3990             :   // for each SUnit in the NodeOrder, check whether
    3991             :   // it appears after both a successor and a predecessor
    3992             :   // of the SUnit. If this is the case, and the SUnit
    3993             :   // is not part of circuit, then the NodeOrder is not
    3994             :   // valid.
    3995        2091 :   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
    3996        3800 :     SUnit *SU = NodeOrder[i];
    3997             :     unsigned Index = i;
    3998             : 
    3999             :     bool PredBefore = false;
    4000             :     bool SuccBefore = false;
    4001             : 
    4002             :     SUnit *Succ;
    4003             :     SUnit *Pred;
    4004             :     (void)Succ;
    4005             :     (void)Pred;
    4006             : 
    4007        5674 :     for (SDep &PredEdge : SU->Preds) {
    4008             :       SUnit *PredSU = PredEdge.getSUnit();
    4009             :       unsigned PredIndex =
    4010             :           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
    4011        2290 :                                         std::make_pair(PredSU, 0), CompareKey));
    4012        3648 :       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
    4013             :         PredBefore = true;
    4014             :         Pred = PredSU;
    4015             :         break;
    4016             :       }
    4017             :     }
    4018             : 
    4019        3252 :     for (SDep &SuccEdge : SU->Succs) {
    4020             :       SUnit *SuccSU = SuccEdge.getSUnit();
    4021             :       unsigned SuccIndex =
    4022             :           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
    4023        1784 :                                         std::make_pair(SuccSU, 0), CompareKey));
    4024        3558 :       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
    4025             :         SuccBefore = true;
    4026             :         Succ = SuccSU;
    4027             :         break;
    4028             :       }
    4029             :     }
    4030             : 
    4031        1900 :     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
    4032             :       // instructions in circuits are allowed to be scheduled
    4033             :       // after both a successor and predecessor.
    4034             :       bool InCircuit = std::any_of(
    4035             :           Circuits.begin(), Circuits.end(),
    4036             :           [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
    4037             :       if (InCircuit)
    4038             :         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
    4039             :       else {
    4040             :         Valid = false;
    4041             :         NumNodeOrderIssues++;
    4042             :         LLVM_DEBUG(dbgs() << "Predecessor ";);
    4043             :       }
    4044             :       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
    4045             :                         << " are scheduled before node " << SU->NodeNum
    4046             :                         << "\n";);
    4047             :     }
    4048             :   }
    4049             : 
    4050             :   LLVM_DEBUG({
    4051             :     if (!Valid)
    4052             :       dbgs() << "Invalid node order found!\n";
    4053             :   });
    4054         191 : }
    4055             : 
    4056             : /// Attempt to fix the degenerate cases when the instruction serialization
    4057             : /// causes the register lifetimes to overlap. For example,
    4058             : ///   p' = store_pi(p, b)
    4059             : ///      = load p, offset
    4060             : /// In this case p and p' overlap, which means that two registers are needed.
    4061             : /// Instead, this function changes the load to use p' and updates the offset.
    4062         517 : void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
    4063             :   unsigned OverlapReg = 0;
    4064             :   unsigned NewBaseReg = 0;
    4065        2205 :   for (SUnit *SU : Instrs) {
    4066        1688 :     MachineInstr *MI = SU->getInstr();
    4067        6989 :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    4068        5575 :       const MachineOperand &MO = MI->getOperand(i);
    4069             :       // Look for an instruction that uses p. The instruction occurs in the
    4070             :       // same cycle but occurs later in the serialized order.
    4071        9699 :       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
    4072             :         // Check that the instruction appears in the InstrChanges structure,
    4073             :         // which contains instructions that can have the offset updated.
    4074             :         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    4075           2 :           InstrChanges.find(SU);
    4076           2 :         if (It != InstrChanges.end()) {
    4077             :           unsigned BasePos, OffsetPos;
    4078             :           // Update the base register and adjust the offset.
    4079           2 :           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
    4080           2 :             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    4081           4 :             NewMI->getOperand(BasePos).setReg(NewBaseReg);
    4082             :             int64_t NewOffset =
    4083           4 :                 MI->getOperand(OffsetPos).getImm() - It->second.second;
    4084           2 :             NewMI->getOperand(OffsetPos).setImm(NewOffset);
    4085             :             SU->setInstr(NewMI);
    4086           4 :             MISUnitMap[NewMI] = SU;
    4087           2 :             NewMIs.insert(NewMI);
    4088             :           }
    4089             :         }
    4090             :         OverlapReg = 0;
    4091             :         NewBaseReg = 0;
    4092             :         break;
    4093             :       }
    4094             :       // Look for an instruction of the form p' = op(p), which uses and defines
    4095             :       // two virtual registers that get allocated to the same physical register.
    4096        5573 :       unsigned TiedUseIdx = 0;
    4097        5573 :       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
    4098             :         // OverlapReg is p in the example above.
    4099         544 :         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
    4100             :         // NewBaseReg is p' in the example above.
    4101         272 :         NewBaseReg = MI->getOperand(i).getReg();
    4102         272 :         break;
    4103             :       }
    4104             :     }
    4105             :   }
    4106         517 : }
    4107             : 
    4108             : /// After the schedule has been formed, call this function to combine
    4109             : /// the instructions from the different stages/cycles.  That is, this
    4110             : /// function creates a schedule that represents a single iteration.
    4111         178 : void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
    4112             :   // Move all instructions to the first stage from later stages.
    4113        1390 :   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    4114        2050 :     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
    4115             :          ++stage) {
    4116             :       std::deque<SUnit *> &cycleInstrs =
    4117        1016 :           ScheduledInstrs[cycle + (stage * InitiationInterval)];
    4118             :       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
    4119             :                                                  E = cycleInstrs.rend();
    4120        1171 :            I != E; ++I)
    4121         663 :         ScheduledInstrs[cycle].push_front(*I);
    4122             :     }
    4123             :   }
    4124             :   // Iterate over the definitions in each instruction, and compute the
    4125             :   // stage difference for each use.  Keep the maximum value.
    4126        1866 :   for (auto &I : InstrToCycle) {
    4127        1688 :     int DefStage = stageScheduled(I.first);
    4128        1688 :     MachineInstr *MI = I.first->getInstr();
    4129        8013 :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    4130        6325 :       MachineOperand &Op = MI->getOperand(i);
    4131       11003 :       if (!Op.isReg() || !Op.isDef())
    4132        4630 :         continue;
    4133             : 
    4134        1695 :       unsigned Reg = Op.getReg();
    4135        1695 :       unsigned MaxDiff = 0;
    4136             :       bool PhiIsSwapped = false;
    4137        1695 :       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
    4138             :                                              EI = MRI.use_end();
    4139        3939 :            UI != EI; ++UI) {
    4140             :         MachineOperand &UseOp = *UI;
    4141        2244 :         MachineInstr *UseMI = UseOp.getParent();
    4142             :         SUnit *SUnitUse = SSD->getSUnit(UseMI);
    4143        2244 :         int UseStage = stageScheduled(SUnitUse);
    4144        2244 :         unsigned Diff = 0;
    4145        2244 :         if (UseStage != -1 && UseStage >= DefStage)
    4146        2140 :           Diff = UseStage - DefStage;
    4147             :         if (MI->isPHI()) {
    4148         673 :           if (isLoopCarried(SSD, *MI))
    4149         648 :             ++Diff;
    4150             :           else
    4151             :             PhiIsSwapped = true;
    4152             :         }
    4153        2244 :         MaxDiff = std::max(Diff, MaxDiff);
    4154             :       }
    4155        1695 :       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    4156             :     }
    4157             :   }
    4158             : 
    4159             :   // Erase all the elements in the later stages. Only one iteration should
    4160             :   // remain in the scheduled list, and it contains all the instructions.
    4161         798 :   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    4162         442 :     ScheduledInstrs.erase(cycle);
    4163             : 
    4164             :   // Change the registers in instruction as specified in the InstrChanges
    4165             :   // map. We need to use the new registers to create the correct order.
    4166        2044 :   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
    4167        1688 :     SUnit *SU = &SSD->SUnits[i];
    4168        1688 :     SSD->applyInstrChange(SU->getInstr(), *this);
    4169             :   }
    4170             : 
    4171             :   // Reorder the instructions in each cycle to fix and improve the
    4172             :   // generated code.
    4173        1390 :   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    4174         517 :     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    4175             :     std::deque<SUnit *> newOrderPhi;
    4176        2205 :     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
    4177        1688 :       SUnit *SU = cycleInstrs[i];
    4178        1688 :       if (SU->getInstr()->isPHI())
    4179         384 :         newOrderPhi.push_back(SU);
    4180             :     }
    4181             :     std::deque<SUnit *> newOrderI;
    4182        2205 :     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
    4183        1688 :       SUnit *SU = cycleInstrs[i];
    4184        1688 :       if (!SU->getInstr()->isPHI())
    4185        1304 :         orderDependence(SSD, SU, newOrderI);
    4186             :     }
    4187             :     // Replace the old order with the new order.
    4188             :     cycleInstrs.swap(newOrderPhi);
    4189         517 :     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
    4190         517 :     SSD->fixupRegisterOverlaps(cycleInstrs);
    4191             :   }
    4192             : 
    4193             :   LLVM_DEBUG(dump(););
    4194      303685 : }
    4195             : 
    4196             : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    4197             : /// Print the schedule information to the given output.
    4198             : void SMSchedule::print(raw_ostream &os) const {
    4199             :   // Iterate over each cycle.
    4200             :   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    4201             :     // Iterate over each instruction in the cycle.
    4202             :     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    4203             :     for (SUnit *CI : cycleInstrs->second) {
    4204             :       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
    4205             :       os << "(" << CI->NodeNum << ") ";
    4206             :       CI->getInstr()->print(os);
    4207             :       os << "\n";
    4208             :     }
    4209             :   }
    4210             : }
    4211             : 
    4212             : /// Utility function used for debugging to print the schedule.
    4213             : LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
    4214             : #endif

Generated by: LCOV version 1.13