LCOV - llvm-toolchain.info - lib/CodeGen/MachinePipeliner.cpp

LCOV - code coverage report

Current view:	top level - lib/CodeGen - MachinePipeliner.cpp (source / functions)		Hit	Total	Coverage
Test:	llvm-toolchain.info	Lines:	1502	1724	87.1 %
Date:	2018-10-20 13:21:21	Functions:	79	110	71.8 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
      11             : //
      12             : // Software pipelining (SWP) is an instruction scheduling technique for loops
      13             : // that overlap loop iterations and exploits ILP via a compiler transformation.
      14             : //
      15             : // Swing Modulo Scheduling is an implementation of software pipelining
      16             : // that generates schedules that are near optimal in terms of initiation
      17             : // interval, register requirements, and stage count. See the papers:
      18             : //
      19             : // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
      20             : // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
      21             : // Conference on Parallel Architectures and Compilation Techiniques.
      22             : //
      23             : // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
      24             : // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
      25             : // Transactions on Computers, Vol. 50, No. 3, 2001.
      26             : //
      27             : // "An Implementation of Swing Modulo Scheduling With Extensions for
      28             : // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
      29             : // Urbana-Chambpain, 2005.
      30             : //
      31             : //
      32             : // The SMS algorithm consists of three main steps after computing the minimal
      33             : // initiation interval (MII).
      34             : // 1) Analyze the dependence graph and compute information about each
      35             : //    instruction in the graph.
      36             : // 2) Order the nodes (instructions) by priority based upon the heuristics
      37             : //    described in the algorithm.
      38             : // 3) Attempt to schedule the nodes in the specified order using the MII.
      39             : //
      40             : // This SMS implementation is a target-independent back-end pass. When enabled,
      41             : // the pass runs just prior to the register allocation pass, while the machine
      42             : // IR is in SSA form. If software pipelining is successful, then the original
      43             : // loop is replaced by the optimized loop. The optimized loop contains one or
      44             : // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
      45             : // the instructions cannot be scheduled in a given MII, we increase the MII by
      46             : // one and try again.
      47             : //
      48             : // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
      49             : // represent loop carried dependences in the DAG as order edges to the Phi
      50             : // nodes. We also perform several passes over the DAG to eliminate unnecessary
      51             : // edges that inhibit the ability to pipeline. The implementation uses the
      52             : // DFAPacketizer class to compute the minimum initiation interval and the check
      53             : // where an instruction may be inserted in the pipelined schedule.
      54             : //
      55             : // In order for the SMS pass to work, several target specific hooks need to be
      56             : // implemented to get information about the loop structure and to rewrite
      57             : // instructions.
      58             : //
      59             : //===----------------------------------------------------------------------===//
      60             : 
      61             : #include "llvm/ADT/ArrayRef.h"
      62             : #include "llvm/ADT/BitVector.h"
      63             : #include "llvm/ADT/DenseMap.h"
      64             : #include "llvm/ADT/MapVector.h"
      65             : #include "llvm/ADT/PriorityQueue.h"
      66             : #include "llvm/ADT/SetVector.h"
      67             : #include "llvm/ADT/SmallPtrSet.h"
      68             : #include "llvm/ADT/SmallSet.h"
      69             : #include "llvm/ADT/SmallVector.h"
      70             : #include "llvm/ADT/Statistic.h"
      71             : #include "llvm/ADT/iterator_range.h"
      72             : #include "llvm/Analysis/AliasAnalysis.h"
      73             : #include "llvm/Analysis/MemoryLocation.h"
      74             : #include "llvm/Analysis/ValueTracking.h"
      75             : #include "llvm/CodeGen/DFAPacketizer.h"
      76             : #include "llvm/CodeGen/LiveIntervals.h"
      77             : #include "llvm/CodeGen/MachineBasicBlock.h"
      78             : #include "llvm/CodeGen/MachineDominators.h"
      79             : #include "llvm/CodeGen/MachineFunction.h"
      80             : #include "llvm/CodeGen/MachineFunctionPass.h"
      81             : #include "llvm/CodeGen/MachineInstr.h"
      82             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      83             : #include "llvm/CodeGen/MachineLoopInfo.h"
      84             : #include "llvm/CodeGen/MachineMemOperand.h"
      85             : #include "llvm/CodeGen/MachineOperand.h"
      86             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      87             : #include "llvm/CodeGen/RegisterClassInfo.h"
      88             : #include "llvm/CodeGen/RegisterPressure.h"
      89             : #include "llvm/CodeGen/ScheduleDAG.h"
      90             : #include "llvm/CodeGen/ScheduleDAGInstrs.h"
      91             : #include "llvm/CodeGen/ScheduleDAGMutation.h"
      92             : #include "llvm/CodeGen/TargetInstrInfo.h"
      93             : #include "llvm/CodeGen/TargetOpcodes.h"
      94             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      95             : #include "llvm/CodeGen/TargetSubtargetInfo.h"
      96             : #include "llvm/Config/llvm-config.h"
      97             : #include "llvm/IR/Attributes.h"
      98             : #include "llvm/IR/DebugLoc.h"
      99             : #include "llvm/IR/Function.h"
     100             : #include "llvm/MC/LaneBitmask.h"
     101             : #include "llvm/MC/MCInstrDesc.h"
     102             : #include "llvm/MC/MCInstrItineraries.h"
     103             : #include "llvm/MC/MCRegisterInfo.h"
     104             : #include "llvm/Pass.h"
     105             : #include "llvm/Support/Casting.h"
     106             : #include "llvm/Support/CommandLine.h"
     107             : #include "llvm/Support/Compiler.h"
     108             : #include "llvm/Support/Debug.h"
     109             : #include "llvm/Support/MathExtras.h"
     110             : #include "llvm/Support/raw_ostream.h"
     111             : #include <algorithm>
     112             : #include <cassert>
     113             : #include <climits>
     114             : #include <cstdint>
     115             : #include <deque>
     116             : #include <functional>
     117             : #include <iterator>
     118             : #include <map>
     119             : #include <memory>
     120             : #include <tuple>
     121             : #include <utility>
     122             : #include <vector>
     123             : 
     124             : using namespace llvm;
     125             : 
     126             : #define DEBUG_TYPE "pipeliner"
     127             : 
     128             : STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
     129             : STATISTIC(NumPipelined, "Number of loops software pipelined");
     130             : STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
     131             : 
     132             : /// A command line option to turn software pipelining on or off.
     133             : static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
     134             :                                cl::ZeroOrMore,
     135             :                                cl::desc("Enable Software Pipelining"));
     136             : 
     137             : /// A command line option to enable SWP at -Os.
     138             : static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
     139             :                                       cl::desc("Enable SWP at Os."), cl::Hidden,
     140             :                                       cl::init(false));
     141             : 
     142             : /// A command line argument to limit minimum initial interval for pipelining.
     143             : static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
     144             :                               cl::desc("Size limit for the MII."),
     145             :                               cl::Hidden, cl::init(27));
     146             : 
     147             : /// A command line argument to limit the number of stages in the pipeline.
     148             : static cl::opt<int>
     149             :     SwpMaxStages("pipeliner-max-stages",
     150             :                  cl::desc("Maximum stages allowed in the generated scheduled."),
     151             :                  cl::Hidden, cl::init(3));
     152             : 
     153             : /// A command line option to disable the pruning of chain dependences due to
     154             : /// an unrelated Phi.
     155             : static cl::opt<bool>
     156             :     SwpPruneDeps("pipeliner-prune-deps",
     157             :                  cl::desc("Prune dependences between unrelated Phi nodes."),
     158             :                  cl::Hidden, cl::init(true));
     159             : 
     160             : /// A command line option to disable the pruning of loop carried order
     161             : /// dependences.
     162             : static cl::opt<bool>
     163             :     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
     164             :                         cl::desc("Prune loop carried order dependences."),
     165             :                         cl::Hidden, cl::init(true));
     166             : 
     167             : #ifndef NDEBUG
     168             : static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
     169             : #endif
     170             : 
     171             : static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
     172             :                                      cl::ReallyHidden, cl::init(false),
     173             :                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
     174             : 
     175             : // A command line option to enable the CopyToPhi DAG mutation.
     176             : static cl::opt<bool>
     177             :     SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
     178             :                        cl::init(true), cl::ZeroOrMore,
     179             :                        cl::desc("Enable CopyToPhi DAG Mutation"));
     180             : 
     181             : namespace {
     182             : 
     183             : class NodeSet;
     184             : class SMSchedule;
     185             : 
     186             : /// The main class in the implementation of the target independent
     187             : /// software pipeliner pass.
     188             : class MachinePipeliner : public MachineFunctionPass {
     189             : public:
     190             :   MachineFunction *MF = nullptr;
     191             :   const MachineLoopInfo *MLI = nullptr;
     192             :   const MachineDominatorTree *MDT = nullptr;
     193             :   const InstrItineraryData *InstrItins;
     194             :   const TargetInstrInfo *TII = nullptr;
     195             :   RegisterClassInfo RegClassInfo;
     196             : 
     197             : #ifndef NDEBUG
     198             :   static int NumTries;
     199             : #endif
     200             : 
     201             :   /// Cache the target analysis information about the loop.
     202         843 :   struct LoopInfo {
     203             :     MachineBasicBlock *TBB = nullptr;
     204             :     MachineBasicBlock *FBB = nullptr;
     205             :     SmallVector<MachineOperand, 4> BrCond;
     206             :     MachineInstr *LoopInductionVar = nullptr;
     207             :     MachineInstr *LoopCompare = nullptr;
     208             :   };
     209             :   LoopInfo LI;
     210             : 
     211             :   static char ID;
     212             : 
     213         843 :   MachinePipeliner() : MachineFunctionPass(ID) {
     214         843 :     initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
     215         843 :   }
     216             : 
     217             :   bool runOnMachineFunction(MachineFunction &MF) override;
     218             : 
     219         836 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     220             :     AU.addRequired<AAResultsWrapperPass>();
     221             :     AU.addPreserved<AAResultsWrapperPass>();
     222             :     AU.addRequired<MachineLoopInfo>();
     223             :     AU.addRequired<MachineDominatorTree>();
     224             :     AU.addRequired<LiveIntervals>();
     225         836 :     MachineFunctionPass::getAnalysisUsage(AU);
     226         836 :   }
     227             : 
     228             : private:
     229             :   void preprocessPhiNodes(MachineBasicBlock &B);
     230             :   bool canPipelineLoop(MachineLoop &L);
     231             :   bool scheduleLoop(MachineLoop &L);
     232             :   bool swingModuloScheduler(MachineLoop &L);
     233             : };
     234             : 
     235             : /// This class builds the dependence graph for the instructions in a loop,
     236             : /// and attempts to schedule the instructions using the SMS algorithm.
     237             : class SwingSchedulerDAG : public ScheduleDAGInstrs {
     238             :   MachinePipeliner &Pass;
     239             :   /// The minimum initiation interval between iterations for this schedule.
     240             :   unsigned MII = 0;
     241             :   /// Set to true if a valid pipelined schedule is found for the loop.
     242             :   bool Scheduled = false;
     243             :   MachineLoop &Loop;
     244             :   LiveIntervals &LIS;
     245             :   const RegisterClassInfo &RegClassInfo;
     246             : 
     247             :   /// A toplogical ordering of the SUnits, which is needed for changing
     248             :   /// dependences and iterating over the SUnits.
     249             :   ScheduleDAGTopologicalSort Topo;
     250             : 
     251             :   struct NodeInfo {
     252             :     int ASAP = 0;
     253             :     int ALAP = 0;
     254             :     int ZeroLatencyDepth = 0;
     255             :     int ZeroLatencyHeight = 0;
     256             : 
     257             :     NodeInfo() = default;
     258             :   };
     259             :   /// Computed properties for each node in the graph.
     260             :   std::vector<NodeInfo> ScheduleInfo;
     261             : 
     262             :   enum OrderKind { BottomUp = 0, TopDown = 1 };
     263             :   /// Computed node ordering for scheduling.
     264             :   SetVector<SUnit *> NodeOrder;
     265             : 
     266             :   using NodeSetType = SmallVector<NodeSet, 8>;
     267             :   using ValueMapTy = DenseMap<unsigned, unsigned>;
     268             :   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
     269             :   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
     270             : 
     271             :   /// Instructions to change when emitting the final schedule.
     272             :   DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
     273             : 
     274             :   /// We may create a new instruction, so remember it because it
     275             :   /// must be deleted when the pass is finished.
     276             :   SmallPtrSet<MachineInstr *, 4> NewMIs;
     277             : 
     278             :   /// Ordered list of DAG postprocessing steps.
     279             :   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
     280             : 
     281             :   /// Helper class to implement Johnson's circuit finding algorithm.
     282             :   class Circuits {
     283             :     std::vector<SUnit> &SUnits;
     284             :     SetVector<SUnit *> Stack;
     285             :     BitVector Blocked;
     286             :     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
     287             :     SmallVector<SmallVector<int, 4>, 16> AdjK;
     288             :     // Node to Index from ScheduleDAGTopologicalSort
     289             :     std::vector<int> *Node2Idx;
     290             :     unsigned NumPaths;
     291             :     static unsigned MaxPaths;
     292             : 
     293             :   public:
     294         190 :     Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
     295         570 :         : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
     296         380 :       Node2Idx = new std::vector<int>(SUs.size());
     297             :       unsigned Idx = 0;
     298        2201 :       for (const auto &NodeNum : Topo)
     299        2011 :         Node2Idx->at(NodeNum) = Idx++;
     300         190 :     }
     301             : 
     302         380 :     ~Circuits() { delete Node2Idx; }
     303             : 
     304             :     /// Reset the data structures used in the circuit algorithm.
     305        2011 :     void reset() {
     306             :       Stack.clear();
     307             :       Blocked.reset();
     308        6033 :       B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
     309        2011 :       NumPaths = 0;
     310        2011 :     }
     311             : 
     312             :     void createAdjacencyStructure(SwingSchedulerDAG *DAG);
     313             :     bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
     314             :     void unblock(int U);
     315             :   };
     316             : 
     317         190 :   struct CopyToPhiMutation : public ScheduleDAGMutation {
     318             :     void apply(ScheduleDAGInstrs *DAG) override;
     319             :   };
     320             : 
     321             : public:
     322         190 :   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
     323             :                     const RegisterClassInfo &rci)
     324         380 :       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
     325         380 :         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
     326         190 :     P.MF->getSubtarget().getSMSMutations(Mutations);
     327         190 :     if (SwpEnableCopyToPhi)
     328         190 :       Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
     329         190 :   }
     330             : 
     331             :   void schedule() override;
     332             :   void finishBlock() override;
     333             : 
     334             :   /// Return true if the loop kernel has been scheduled.
     335           0 :   bool hasNewSchedule() { return Scheduled; }
     336             : 
     337             :   /// Return the earliest time an instruction may be scheduled.
     338       10092 :   int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
     339             : 
     340             :   /// Return the latest time an instruction my be scheduled.
     341        8948 :   int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
     342             : 
     343             :   /// The mobility function, which the number of slots in which
     344             :   /// an instruction may be scheduled.
     345        4582 :   int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
     346             : 
     347             :   /// The depth, in the dependence graph, for a node.
     348           0 :   unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
     349             : 
     350             :   /// The maximum unweighted length of a path from an arbitrary node to the
     351             :   /// given node in which each edge has latency 0
     352           0 :   int getZeroLatencyDepth(SUnit *Node) {
     353        8440 :     return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
     354             :   }
     355             : 
     356             :   /// The height, in the dependence graph, for a node.
     357           0 :   unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
     358             : 
     359             :   /// The maximum unweighted length of a path from the given node to an
     360             :   /// arbitrary node in which each edge has latency 0
     361           0 :   int getZeroLatencyHeight(SUnit *Node) {
     362        5940 :     return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
     363             :   }
     364             : 
     365             :   /// Return true if the dependence is a back-edge in the data dependence graph.
     366             :   /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
     367             :   /// using an anti dependence from a Phi to an instruction.
     368           0 :   bool isBackedge(SUnit *Source, const SDep &Dep) {
     369           0 :     if (Dep.getKind() != SDep::Anti)
     370           0 :       return false;
     371           0 :     return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
     372             :   }
     373             : 
     374             :   bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
     375             : 
     376             :   /// The distance function, which indicates that operation V of iteration I
     377             :   /// depends on operations U of iteration I-distance.
     378           0 :   unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
     379             :     // Instructions that feed a Phi have a distance of 1. Computing larger
     380             :     // values for arrays requires data dependence information.
     381         436 :     if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
     382           0 :       return 1;
     383             :     return 0;
     384             :   }
     385             : 
     386             :   /// Set the Minimum Initiation Interval for this schedule attempt.
     387             :   void setMII(unsigned mii) { MII = mii; }
     388             : 
     389             :   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
     390             : 
     391             :   void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
     392             : 
     393             :   /// Return the new base register that was stored away for the changed
     394             :   /// instruction.
     395             :   unsigned getInstrBaseReg(SUnit *SU) {
     396             :     DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
     397         434 :         InstrChanges.find(SU);
     398         434 :     if (It != InstrChanges.end())
     399          53 :       return It->second.first;
     400             :     return 0;
     401             :   }
     402             : 
     403             :   void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
     404             :     Mutations.push_back(std::move(Mutation));
     405             :   }
     406             : 
     407             :   static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
     408             : 
     409             : private:
     410             :   void addLoopCarriedDependences(AliasAnalysis *AA);
     411             :   void updatePhiDependences();
     412             :   void changeDependences();
     413             :   unsigned calculateResMII();
     414             :   unsigned calculateRecMII(NodeSetType &RecNodeSets);
     415             :   void findCircuits(NodeSetType &NodeSets);
     416             :   void fuseRecs(NodeSetType &NodeSets);
     417             :   void removeDuplicateNodes(NodeSetType &NodeSets);
     418             :   void computeNodeFunctions(NodeSetType &NodeSets);
     419             :   void registerPressureFilter(NodeSetType &NodeSets);
     420             :   void colocateNodeSets(NodeSetType &NodeSets);
     421             :   void checkNodeSets(NodeSetType &NodeSets);
     422             :   void groupRemainingNodes(NodeSetType &NodeSets);
     423             :   void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
     424             :                          SetVector<SUnit *> &NodesAdded);
     425             :   void computeNodeOrder(NodeSetType &NodeSets);
     426             :   void checkValidNodeOrder(const NodeSetType &Circuits) const;
     427             :   bool schedulePipeline(SMSchedule &Schedule);
     428             :   void generatePipelinedLoop(SMSchedule &Schedule);
     429             :   void generateProlog(SMSchedule &Schedule, unsigned LastStage,
     430             :                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
     431             :                       MBBVectorTy &PrologBBs);
     432             :   void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
     433             :                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
     434             :                       MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
     435             :   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
     436             :                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
     437             :                             SMSchedule &Schedule, ValueMapTy *VRMap,
     438             :                             InstrMapTy &InstrMap, unsigned LastStageNum,
     439             :                             unsigned CurStageNum, bool IsLast);
     440             :   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
     441             :                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
     442             :                     SMSchedule &Schedule, ValueMapTy *VRMap,
     443             :                     InstrMapTy &InstrMap, unsigned LastStageNum,
     444             :                     unsigned CurStageNum, bool IsLast);
     445             :   void removeDeadInstructions(MachineBasicBlock *KernelBB,
     446             :                               MBBVectorTy &EpilogBBs);
     447             :   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
     448             :                       SMSchedule &Schedule);
     449             :   void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
     450             :                    MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
     451             :                    ValueMapTy *VRMap);
     452             :   bool computeDelta(MachineInstr &MI, unsigned &Delta);
     453             :   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
     454             :                          unsigned Num);
     455             :   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
     456             :                            unsigned InstStageNum);
     457             :   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
     458             :                                     unsigned InstStageNum,
     459             :                                     SMSchedule &Schedule);
     460             :   void updateInstruction(MachineInstr *NewMI, bool LastDef,
     461             :                          unsigned CurStageNum, unsigned InstrStageNum,
     462             :                          SMSchedule &Schedule, ValueMapTy *VRMap);
     463             :   MachineInstr *findDefInLoop(unsigned Reg);
     464             :   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
     465             :                          unsigned LoopStage, ValueMapTy *VRMap,
     466             :                          MachineBasicBlock *BB);
     467             :   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
     468             :                         SMSchedule &Schedule, ValueMapTy *VRMap,
     469             :                         InstrMapTy &InstrMap);
     470             :   void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
     471             :                              InstrMapTy &InstrMap, unsigned CurStageNum,
     472             :                              unsigned PhiNum, MachineInstr *Phi,
     473             :                              unsigned OldReg, unsigned NewReg,
     474             :                              unsigned PrevReg = 0);
     475             :   bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
     476             :                              unsigned &OffsetPos, unsigned &NewBase,
     477             :                              int64_t &NewOffset);
     478             :   void postprocessDAG();
     479             : };
     480             : 
     481             : /// A NodeSet contains a set of SUnit DAG nodes with additional information
     482             : /// that assigns a priority to the set.
     483        3324 : class NodeSet {
     484             :   SetVector<SUnit *> Nodes;
     485             :   bool HasRecurrence = false;
     486             :   unsigned RecMII = 0;
     487             :   int MaxMOV = 0;
     488             :   unsigned MaxDepth = 0;
     489             :   unsigned Colocate = 0;
     490             :   SUnit *ExceedPressure = nullptr;
     491             :   unsigned Latency = 0;
     492             : 
     493             : public:
     494             :   using iterator = SetVector<SUnit *>::const_iterator;
     495             : 
     496         188 :   NodeSet() = default;
     497         578 :   NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
     498             :     Latency = 0;
     499        2492 :     for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
     500        6963 :       for (const SDep &Succ : Nodes[i]->Succs)
     501        3135 :         if (Nodes.count(Succ.getSUnit()))
     502        1946 :           Latency += Succ.getLatency();
     503         578 :   }
     504             : 
     505         545 :   bool insert(SUnit *SU) { return Nodes.insert(SU); }
     506             : 
     507          83 :   void insert(iterator S, iterator E) { Nodes.insert(S, E); }
     508             : 
     509             :   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
     510           0 :     return Nodes.remove_if(P);
     511             :   }
     512             : 
     513             :   unsigned count(SUnit *SU) const { return Nodes.count(SU); }
     514             : 
     515             :   bool hasRecurrence() { return HasRecurrence; };
     516             : 
     517         420 :   unsigned size() const { return Nodes.size(); }
     518             : 
     519             :   bool empty() const { return Nodes.empty(); }
     520             : 
     521           1 :   SUnit *getNode(unsigned i) const { return Nodes[i]; };
     522             : 
     523           0 :   void setRecMII(unsigned mii) { RecMII = mii; };
     524             : 
     525           0 :   void setColocate(unsigned c) { Colocate = c; };
     526             : 
     527           1 :   void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
     528             : 
     529           0 :   bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
     530             : 
     531           0 :   int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
     532             : 
     533           0 :   int getRecMII() { return RecMII; }
     534             : 
     535             :   /// Summarize node functions for the entire node set.
     536         420 :   void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
     537        1656 :     for (SUnit *SU : *this) {
     538        1519 :       MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
     539        1890 :       MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
     540             :     }
     541         420 :   }
     542             : 
     543           0 :   unsigned getLatency() { return Latency; }
     544             : 
     545           0 :   unsigned getMaxDepth() { return MaxDepth; }
     546             : 
     547             :   void clear() {
     548             :     Nodes.clear();
     549         197 :     RecMII = 0;
     550         197 :     HasRecurrence = false;
     551         197 :     MaxMOV = 0;
     552         197 :     MaxDepth = 0;
     553         197 :     Colocate = 0;
     554         197 :     ExceedPressure = nullptr;
     555             :   }
     556             : 
     557         420 :   operator SetVector<SUnit *> &() { return Nodes; }
     558             : 
     559             :   /// Sort the node sets by importance. First, rank them by recurrence MII,
     560             :   /// then by mobility (least mobile done first), and finally by depth.
     561             :   /// Each node set may contain a colocate value which is used as the first
     562             :   /// tie breaker, if it's set.
     563             :   bool operator>(const NodeSet &RHS) const {
     564           0 :     if (RecMII == RHS.RecMII) {
     565           0 :       if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
     566           0 :         return Colocate < RHS.Colocate;
     567           0 :       if (MaxMOV == RHS.MaxMOV)
     568           0 :         return MaxDepth > RHS.MaxDepth;
     569           0 :       return MaxMOV < RHS.MaxMOV;
     570             :     }
     571           0 :     return RecMII > RHS.RecMII;
     572             :   }
     573             : 
     574             :   bool operator==(const NodeSet &RHS) const {
     575             :     return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
     576             :            MaxDepth == RHS.MaxDepth;
     577             :   }
     578             : 
     579             :   bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
     580             : 
     581             :   iterator begin() { return Nodes.begin(); }
     582             :   iterator end() { return Nodes.end(); }
     583             : 
     584             :   void print(raw_ostream &os) const {
     585             :     os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     586             :        << " depth " << MaxDepth << " col " << Colocate << "\n";
     587             :     for (const auto &I : Nodes)
     588             :       os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
     589             :     os << "\n";
     590             :   }
     591             : 
     592             : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     593             :   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
     594             : #endif
     595             : };
     596             : 
     597             : /// This class represents the scheduled code.  The main data structure is a
     598             : /// map from scheduled cycle to instructions.  During scheduling, the
     599             : /// data structure explicitly represents all stages/iterations.   When
     600             : /// the algorithm finshes, the schedule is collapsed into a single stage,
     601             : /// which represents instructions from different loop iterations.
     602             : ///
     603             : /// The SMS algorithm allows negative values for cycles, so the first cycle
     604             : /// in the schedule is the smallest cycle value.
     605             : class SMSchedule {
     606             : private:
     607             :   /// Map from execution cycle to instructions.
     608             :   DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
     609             : 
     610             :   /// Map from instruction to execution cycle.
     611             :   std::map<SUnit *, int> InstrToCycle;
     612             : 
     613             :   /// Map for each register and the max difference between its uses and def.
     614             :   /// The first element in the pair is the max difference in stages. The
     615             :   /// second is true if the register defines a Phi value and loop value is
     616             :   /// scheduled before the Phi.
     617             :   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
     618             : 
     619             :   /// Keep track of the first cycle value in the schedule.  It starts
     620             :   /// as zero, but the algorithm allows negative values.
     621             :   int FirstCycle = 0;
     622             : 
     623             :   /// Keep track of the last cycle value in the schedule.
     624             :   int LastCycle = 0;
     625             : 
     626             :   /// The initiation interval (II) for the schedule.
     627             :   int InitiationInterval = 0;
     628             : 
     629             :   /// Target machine information.
     630             :   const TargetSubtargetInfo &ST;
     631             : 
     632             :   /// Virtual register information.
     633             :   MachineRegisterInfo &MRI;
     634             : 
     635             :   std::unique_ptr<DFAPacketizer> Resources;
     636             : 
     637             : public:
     638         188 :   SMSchedule(MachineFunction *mf)
     639         564 :       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
     640         188 :         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
     641             : 
     642         277 :   void reset() {
     643         277 :     ScheduledInstrs.clear();
     644             :     InstrToCycle.clear();
     645             :     RegToStageDiff.clear();
     646         277 :     FirstCycle = 0;
     647         277 :     LastCycle = 0;
     648         277 :     InitiationInterval = 0;
     649         277 :   }
     650             : 
     651             :   /// Set the initiation interval for this schedule.
     652         272 :   void setInitiationInterval(int ii) { InitiationInterval = ii; }
     653             : 
     654             :   /// Return the first cycle in the completed schedule.  This
     655             :   /// can be a negative value.
     656           0 :   int getFirstCycle() const { return FirstCycle; }
     657             : 
     658             :   /// Return the last cycle in the finalized schedule.
     659        1150 :   int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
     660             : 
     661             :   /// Return the cycle of the earliest scheduled instruction in the dependence
     662             :   /// chain.
     663             :   int earliestCycleInChain(const SDep &Dep);
     664             : 
     665             :   /// Return the cycle of the latest scheduled instruction in the dependence
     666             :   /// chain.
     667             :   int latestCycleInChain(const SDep &Dep);
     668             : 
     669             :   void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
     670             :                     int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
     671             :   bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
     672             : 
     673             :   /// Iterators for the cycle to instruction map.
     674             :   using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
     675             :   using const_sched_iterator =
     676             :       DenseMap<int, std::deque<SUnit *>>::const_iterator;
     677             : 
     678             :   /// Return true if the instruction is scheduled at the specified stage.
     679        4788 :   bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
     680        4788 :     return (stageScheduled(SU) == (int)StageNum);
     681             :   }
     682             : 
     683             :   /// Return the stage for a scheduled instruction.  Return -1 if
     684             :   /// the instruction has not been scheduled.
     685             :   int stageScheduled(SUnit *SU) const {
     686             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
     687       29593 :     if (it == InstrToCycle.end())
     688             :       return -1;
     689       28080 :     return (it->second - FirstCycle) / InitiationInterval;
     690             :   }
     691             : 
     692             :   /// Return the cycle for a scheduled instruction. This function normalizes
     693             :   /// the first cycle to be 0.
     694             :   unsigned cycleScheduled(SUnit *SU) const {
     695             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
     696             :     assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
     697        5955 :     return (it->second - FirstCycle) % InitiationInterval;
     698             :   }
     699             : 
     700             :   /// Return the maximum stage count needed for this schedule.
     701             :   unsigned getMaxStageCount() {
     702         500 :     return (LastCycle - FirstCycle) / InitiationInterval;
     703             :   }
     704             : 
     705             :   /// Return the max. number of stages/iterations that can occur between a
     706             :   /// register definition and its uses.
     707        3078 :   unsigned getStagesForReg(int Reg, unsigned CurStage) {
     708        3078 :     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
     709        3078 :     if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
     710           6 :       return 1;
     711             :     return Stages.first;
     712             :   }
     713             : 
     714             :   /// The number of stages for a Phi is a little different than other
     715             :   /// instructions. The minimum value computed in RegToStageDiff is 1
     716             :   /// because we assume the Phi is needed for at least 1 iteration.
     717             :   /// This is not the case if the loop value is scheduled prior to the
     718             :   /// Phi in the same stage.  This function returns the number of stages
     719             :   /// or iterations needed between the Phi definition and any uses.
     720             :   unsigned getStagesForPhi(int Reg) {
     721         864 :     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
     722         432 :     if (Stages.second)
     723             :       return Stages.first;
     724         399 :     return Stages.first - 1;
     725             :   }
     726             : 
     727             :   /// Return the instructions that are scheduled at the specified cycle.
     728             :   std::deque<SUnit *> &getInstructions(int cycle) {
     729       24141 :     return ScheduledInstrs[cycle];
     730             :   }
     731             : 
     732             :   bool isValidSchedule(SwingSchedulerDAG *SSD);
     733             :   void finalizeSchedule(SwingSchedulerDAG *SSD);
     734             :   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
     735             :                        std::deque<SUnit *> &Insts);
     736             :   bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
     737             :   bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
     738             :                              MachineOperand &MO);
     739             :   void print(raw_ostream &os) const;
     740             :   void dump() const;
     741             : };
     742             : 
     743             : } // end anonymous namespace
     744             : 
     745             : unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
     746             : char MachinePipeliner::ID = 0;
     747             : #ifndef NDEBUG
     748             : int MachinePipeliner::NumTries = 0;
     749             : #endif
     750             : char &llvm::MachinePipelinerID = MachinePipeliner::ID;
     751             : 
     752       31780 : INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
     753             :                       "Modulo Software Pipelining", false, false)
     754       31780 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
     755       31780 : INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
     756       31780 : INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
     757       31780 : INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
     758       85990 : INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
     759             :                     "Modulo Software Pipelining", false, false)
     760             : 
     761             : /// The "main" function for implementing Swing Modulo Scheduling.
     762        3294 : bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
     763        3294 :   if (skipFunction(mf.getFunction()))
     764             :     return false;
     765             : 
     766        3284 :   if (!EnableSWP)
     767             :     return false;
     768             : 
     769        3275 :   if (mf.getFunction().getAttributes().hasAttribute(
     770        3275 :           AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
     771          33 :       !EnableSWPOptSize.getPosition())
     772          31 :     return false;
     773             : 
     774        3244 :   MF = &mf;
     775        3244 :   MLI = &getAnalysis<MachineLoopInfo>();
     776        3244 :   MDT = &getAnalysis<MachineDominatorTree>();
     777        3244 :   TII = MF->getSubtarget().getInstrInfo();
     778        3244 :   RegClassInfo.runOnMachineFunction(*MF);
     779             : 
     780        3601 :   for (auto &L : *MLI)
     781         357 :     scheduleLoop(*L);
     782             : 
     783             :   return false;
     784             : }
     785             : 
     786             : /// Attempt to perform the SMS algorithm on the specified loop. This function is
     787             : /// the main entry point for the algorithm.  The function identifies candidate
     788             : /// loops, calculates the minimum initiation interval, and attempts to schedule
     789             : /// the loop.
     790         395 : bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
     791             :   bool Changed = false;
     792         433 :   for (auto &InnerLoop : L)
     793          38 :     Changed |= scheduleLoop(*InnerLoop);
     794             : 
     795             : #ifndef NDEBUG
     796             :   // Stop trying after reaching the limit (if any).
     797             :   int Limit = SwpLoopLimit;
     798             :   if (Limit >= 0) {
     799             :     if (NumTries >= SwpLoopLimit)
     800             :       return Changed;
     801             :     NumTries++;
     802             :   }
     803             : #endif
     804             : 
     805         395 :   if (!canPipelineLoop(L))
     806             :     return Changed;
     807             : 
     808             :   ++NumTrytoPipeline;
     809             : 
     810         190 :   Changed = swingModuloScheduler(L);
     811             : 
     812         190 :   return Changed;
     813             : }
     814             : 
     815             : /// Return true if the loop can be software pipelined.  The algorithm is
     816             : /// restricted to loops with a single basic block.  Make sure that the
     817             : /// branch in the loop can be analyzed.
     818         395 : bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
     819         395 :   if (L.getNumBlocks() != 1)
     820             :     return false;
     821             : 
     822             :   // Check if the branch can't be understood because we can't do pipelining
     823             :   // if that's the case.
     824         333 :   LI.TBB = nullptr;
     825         333 :   LI.FBB = nullptr;
     826             :   LI.BrCond.clear();
     827         666 :   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
     828             :     return false;
     829             : 
     830         333 :   LI.LoopInductionVar = nullptr;
     831         333 :   LI.LoopCompare = nullptr;
     832         333 :   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
     833             :     return false;
     834             : 
     835         191 :   if (!L.getLoopPreheader())
     836             :     return false;
     837             : 
     838             :   // Remove any subregisters from inputs to phi nodes.
     839         190 :   preprocessPhiNodes(*L.getHeader());
     840         190 :   return true;
     841             : }
     842             : 
     843         190 : void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
     844         190 :   MachineRegisterInfo &MRI = MF->getRegInfo();
     845         190 :   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
     846             : 
     847         803 :   for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
     848         423 :     MachineOperand &DefOp = PI.getOperand(0);
     849             :     assert(DefOp.getSubReg() == 0);
     850         423 :     auto *RC = MRI.getRegClass(DefOp.getReg());
     851             : 
     852        1269 :     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
     853         846 :       MachineOperand &RegOp = PI.getOperand(i);
     854         846 :       if (RegOp.getSubReg() == 0)
     855         833 :         continue;
     856             : 
     857             :       // If the operand uses a subregister, replace it with a new register
     858             :       // without subregisters, and generate a copy to the new register.
     859          13 :       unsigned NewReg = MRI.createVirtualRegister(RC);
     860          13 :       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
     861          13 :       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
     862             :       const DebugLoc &DL = PredB.findDebugLoc(At);
     863          26 :       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
     864             :                     .addReg(RegOp.getReg(), getRegState(RegOp),
     865          13 :                             RegOp.getSubReg());
     866          13 :       Slots.insertMachineInstrInMaps(*Copy);
     867          13 :       RegOp.setReg(NewReg);
     868             :       RegOp.setSubReg(0);
     869             :     }
     870             :   }
     871         190 : }
     872             : 
     873             : /// The SMS algorithm consists of the following main steps:
     874             : /// 1. Computation and analysis of the dependence graph.
     875             : /// 2. Ordering of the nodes (instructions).
     876             : /// 3. Attempt to Schedule the loop.
     877         190 : bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
     878             :   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
     879             : 
     880         190 :   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
     881             : 
     882             :   MachineBasicBlock *MBB = L.getHeader();
     883             :   // The kernel should not include any terminator instructions.  These
     884             :   // will be added back later.
     885         190 :   SMS.startBlock(MBB);
     886             : 
     887             :   // Compute the number of 'real' instructions in the basic block by
     888             :   // ignoring terminators.
     889             :   unsigned size = MBB->size();
     890         569 :   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
     891             :                                    E = MBB->instr_end();
     892         569 :        I != E; ++I, --size)
     893             :     ;
     894             : 
     895         190 :   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
     896         190 :   SMS.schedule();
     897         190 :   SMS.exitRegion();
     898             : 
     899         190 :   SMS.finishBlock();
     900         190 :   return SMS.hasNewSchedule();
     901             : }
     902             : 
     903             : /// We override the schedule function in ScheduleDAGInstrs to implement the
     904             : /// scheduling part of the Swing Modulo Scheduling algorithm.
     905         190 : void SwingSchedulerDAG::schedule() {
     906         190 :   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
     907         190 :   buildSchedGraph(AA);
     908         190 :   addLoopCarriedDependences(AA);
     909         190 :   updatePhiDependences();
     910         190 :   Topo.InitDAGTopologicalSorting();
     911         190 :   changeDependences();
     912             :   postprocessDAG();
     913             :   LLVM_DEBUG(dump());
     914             : 
     915         125 :   NodeSetType NodeSets;
     916         190 :   findCircuits(NodeSets);
     917         125 :   NodeSetType Circuits = NodeSets;
     918             : 
     919             :   // Calculate the MII.
     920         190 :   unsigned ResMII = calculateResMII();
     921         190 :   unsigned RecMII = calculateRecMII(NodeSets);
     922             : 
     923         190 :   fuseRecs(NodeSets);
     924             : 
     925             :   // This flag is used for testing and can cause correctness problems.
     926         190 :   if (SwpIgnoreRecMII)
     927           1 :     RecMII = 0;
     928             : 
     929         190 :   MII = std::max(ResMII, RecMII);
     930             :   LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII
     931             :                     << ", res=" << ResMII << ")\n");
     932             : 
     933             :   // Can't schedule a loop without a valid MII.
     934         190 :   if (MII == 0)
     935          65 :     return;
     936             : 
     937             :   // Don't pipeline large loops.
     938         190 :   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
     939             :     return;
     940             : 
     941         188 :   computeNodeFunctions(NodeSets);
     942             : 
     943         188 :   registerPressureFilter(NodeSets);
     944             : 
     945         188 :   colocateNodeSets(NodeSets);
     946             : 
     947         188 :   checkNodeSets(NodeSets);
     948             : 
     949             :   LLVM_DEBUG({
     950             :     for (auto &I : NodeSets) {
     951             :       dbgs() << "  Rec NodeSet ";
     952             :       I.dump();
     953             :     }
     954             :   });
     955             : 
     956             :   std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
     957             : 
     958         188 :   groupRemainingNodes(NodeSets);
     959             : 
     960         188 :   removeDuplicateNodes(NodeSets);
     961             : 
     962             :   LLVM_DEBUG({
     963             :     for (auto &I : NodeSets) {
     964             :       dbgs() << "  NodeSet ";
     965             :       I.dump();
     966             :     }
     967             :   });
     968             : 
     969         188 :   computeNodeOrder(NodeSets);
     970             : 
     971             :   // check for node order issues
     972         188 :   checkValidNodeOrder(Circuits);
     973             : 
     974         313 :   SMSchedule Schedule(Pass.MF);
     975         188 :   Scheduled = schedulePipeline(Schedule);
     976             : 
     977         188 :   if (!Scheduled)
     978          63 :     return;
     979             : 
     980             :   unsigned numStages = Schedule.getMaxStageCount();
     981             :   // No need to generate pipeline if there are no overlapped iterations.
     982         125 :   if (numStages == 0)
     983             :     return;
     984             : 
     985             :   // Check that the maximum stage count is less than user-defined limit.
     986         125 :   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
     987             :     return;
     988             : 
     989         125 :   generatePipelinedLoop(Schedule);
     990             :   ++NumPipelined;
     991             : }
     992             : 
     993             : /// Clean up after the software pipeliner runs.
     994         190 : void SwingSchedulerDAG::finishBlock() {
     995         211 :   for (MachineInstr *I : NewMIs)
     996          21 :     MF.DeleteMachineInstr(I);
     997         190 :   NewMIs.clear();
     998             : 
     999             :   // Call the superclass.
    1000         190 :   ScheduleDAGInstrs::finishBlock();
    1001         190 : }
    1002             : 
    1003             : /// Return the register values for  the operands of a Phi instruction.
    1004             : /// This function assume the instruction is a Phi.
    1005             : static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
    1006             :                        unsigned &InitVal, unsigned &LoopVal) {
    1007             :   assert(Phi.isPHI() && "Expecting a Phi.");
    1008             : 
    1009             :   InitVal = 0;
    1010             :   LoopVal = 0;
    1011        8049 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    1012       10732 :     if (Phi.getOperand(i + 1).getMBB() != Loop)
    1013        1048 :       InitVal = Phi.getOperand(i).getReg();
    1014             :     else
    1015        2683 :       LoopVal = Phi.getOperand(i).getReg();
    1016             : 
    1017             :   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
    1018             : }
    1019             : 
    1020             : /// Return the Phi register value that comes from the incoming block.
    1021             : static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
    1022         342 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    1023         684 :     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
    1024         342 :       return Phi.getOperand(i).getReg();
    1025             :   return 0;
    1026             : }
    1027             : 
    1028             : /// Return the Phi register value that comes the loop block.
    1029             : static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
    1030         931 :   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    1031        1750 :     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
    1032         382 :       return Phi.getOperand(i).getReg();
    1033             :   return 0;
    1034             : }
    1035             : 
    1036             : /// Return true if SUb can be reached from SUa following the chain edges.
    1037         364 : static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
    1038             :   SmallPtrSet<SUnit *, 8> Visited;
    1039             :   SmallVector<SUnit *, 8> Worklist;
    1040         364 :   Worklist.push_back(SUa);
    1041         530 :   while (!Worklist.empty()) {
    1042             :     const SUnit *SU = Worklist.pop_back_val();
    1043        1118 :     for (auto &SI : SU->Succs) {
    1044         952 :       SUnit *SuccSU = SI.getSUnit();
    1045         952 :       if (SI.getKind() == SDep::Order) {
    1046         543 :         if (Visited.count(SuccSU))
    1047           0 :           continue;
    1048         543 :         if (SuccSU == SUb)
    1049         275 :           return true;
    1050         268 :         Worklist.push_back(SuccSU);
    1051         268 :         Visited.insert(SuccSU);
    1052             :       }
    1053             :     }
    1054             :   }
    1055             :   return false;
    1056             : }
    1057             : 
    1058             : /// Return true if the instruction causes a chain between memory
    1059             : /// references before and after it.
    1060        2011 : static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
    1061        4020 :   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
    1062        2010 :          (MI.hasOrderedMemoryRef() &&
    1063           2 :           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
    1064             : }
    1065             : 
    1066             : /// Return the underlying objects for the memory references of an instruction.
    1067             : /// This function calls the code in ValueTracking, but first checks that the
    1068             : /// instruction has a memory operand.
    1069         484 : static void getUnderlyingObjects(MachineInstr *MI,
    1070             :                                  SmallVectorImpl<Value *> &Objs,
    1071             :                                  const DataLayout &DL) {
    1072         484 :   if (!MI->hasOneMemOperand())
    1073             :     return;
    1074         446 :   MachineMemOperand *MM = *MI->memoperands_begin();
    1075         446 :   if (!MM->getValue())
    1076             :     return;
    1077         446 :   GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
    1078         598 :   for (Value *V : Objs) {
    1079         446 :     if (!isIdentifiedObject(V)) {
    1080             :       Objs.clear();
    1081         294 :       return;
    1082             :     }
    1083         152 :     Objs.push_back(V);
    1084             :   }
    1085             : }
    1086             : 
    1087             : /// Add a chain edge between a load and store if the store can be an
    1088             : /// alias of the load on a subsequent iteration, i.e., a loop carried
    1089             : /// dependence. This code is very similar to the code in ScheduleDAGInstrs
    1090             : /// but that code doesn't create loop carried dependences.
    1091         190 : void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
    1092             :   MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
    1093             :   Value *UnknownValue =
    1094         190 :     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
    1095        2201 :   for (auto &SU : SUnits) {
    1096        2011 :     MachineInstr &MI = *SU.getInstr();
    1097        2011 :     if (isDependenceBarrier(MI, AA))
    1098             :       PendingLoads.clear();
    1099        2008 :     else if (MI.mayLoad()) {
    1100             :       SmallVector<Value *, 4> Objs;
    1101         325 :       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
    1102         325 :       if (Objs.empty())
    1103         226 :         Objs.push_back(UnknownValue);
    1104         749 :       for (auto V : Objs) {
    1105         424 :         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
    1106         424 :         SUs.push_back(&SU);
    1107             :       }
    1108        1683 :     } else if (MI.mayStore()) {
    1109             :       SmallVector<Value *, 4> Objs;
    1110         159 :       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
    1111         159 :       if (Objs.empty())
    1112         106 :         Objs.push_back(UnknownValue);
    1113         371 :       for (auto V : Objs) {
    1114             :         MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
    1115         212 :             PendingLoads.find(V);
    1116         212 :         if (I == PendingLoads.end())
    1117             :           continue;
    1118         476 :         for (auto Load : I->second) {
    1119         364 :           if (isSuccOrder(Load, &SU))
    1120         299 :             continue;
    1121          89 :           MachineInstr &LdMI = *Load->getInstr();
    1122             :           // First, perform the cheaper check that compares the base register.
    1123             :           // If they are the same and the load offset is less than the store
    1124             :           // offset, then mark the dependence as loop carried potentially.
    1125             :           unsigned BaseReg1, BaseReg2;
    1126             :           int64_t Offset1, Offset2;
    1127         177 :           if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
    1128          88 :               TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
    1129          86 :             if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
    1130             :               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
    1131             :                      "What happened to the chain edge?");
    1132             :               SDep Dep(Load, SDep::Barrier);
    1133             :               Dep.setLatency(1);
    1134          24 :               SU.addPred(Dep);
    1135             :               continue;
    1136             :             }
    1137             :           }
    1138             :           // Second, the more expensive check that uses alias analysis on the
    1139             :           // base registers. If they alias, and the load offset is less than
    1140             :           // the store offset, the mark the dependence as loop carried.
    1141          65 :           if (!AA) {
    1142             :             SDep Dep(Load, SDep::Barrier);
    1143             :             Dep.setLatency(1);
    1144           0 :             SU.addPred(Dep);
    1145             :             continue;
    1146             :           }
    1147          65 :           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
    1148          65 :           MachineMemOperand *MMO2 = *MI.memoperands_begin();
    1149         130 :           if (!MMO1->getValue() || !MMO2->getValue()) {
    1150             :             SDep Dep(Load, SDep::Barrier);
    1151             :             Dep.setLatency(1);
    1152           0 :             SU.addPred(Dep);
    1153             :             continue;
    1154             :           }
    1155          65 :           if (MMO1->getValue() == MMO2->getValue() &&
    1156          20 :               MMO1->getOffset() <= MMO2->getOffset()) {
    1157             :             SDep Dep(Load, SDep::Barrier);
    1158             :             Dep.setLatency(1);
    1159           0 :             SU.addPred(Dep);
    1160             :             continue;
    1161             :           }
    1162          65 :           AliasResult AAResult = AA->alias(
    1163          65 :               MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
    1164             :                              MMO1->getAAInfo()),
    1165          65 :               MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
    1166             :                              MMO2->getAAInfo()));
    1167             : 
    1168          65 :           if (AAResult != NoAlias) {
    1169             :             SDep Dep(Load, SDep::Barrier);
    1170             :             Dep.setLatency(1);
    1171          20 :             SU.addPred(Dep);
    1172             :           }
    1173             :         }
    1174             :       }
    1175             :     }
    1176             :   }
    1177         190 : }
    1178             : 
    1179             : /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
    1180             : /// processes dependences for PHIs. This function adds true dependences
    1181             : /// from a PHI to a use, and a loop carried dependence from the use to the
    1182             : /// PHI. The loop carried dependence is represented as an anti dependence
    1183             : /// edge. This function also removes chain dependences between unrelated
    1184             : /// PHIs.
    1185         190 : void SwingSchedulerDAG::updatePhiDependences() {
    1186             :   SmallVector<SDep, 4> RemoveDeps;
    1187         190 :   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
    1188             : 
    1189             :   // Iterate over each DAG node.
    1190        2201 :   for (SUnit &I : SUnits) {
    1191             :     RemoveDeps.clear();
    1192             :     // Set to true if the instruction has an operand defined by a Phi.
    1193             :     unsigned HasPhiUse = 0;
    1194             :     unsigned HasPhiDef = 0;
    1195        2011 :     MachineInstr *MI = I.getInstr();
    1196             :     // Iterate over each operand, and we process the definitions.
    1197        7591 :     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
    1198        2011 :                                     MOE = MI->operands_end();
    1199        9602 :          MOI != MOE; ++MOI) {
    1200        7591 :       if (!MOI->isReg())
    1201             :         continue;
    1202        5668 :       unsigned Reg = MOI->getReg();
    1203        5668 :       if (MOI->isDef()) {
    1204             :         // If the register is used by a Phi, then create an anti dependence.
    1205        2738 :         for (MachineRegisterInfo::use_instr_iterator
    1206        2014 :                  UI = MRI.use_instr_begin(Reg),
    1207             :                  UE = MRI.use_instr_end();
    1208        7490 :              UI != UE; ++UI) {
    1209             :           MachineInstr *UseMI = &*UI;
    1210             :           SUnit *SU = getSUnit(UseMI);
    1211        2665 :           if (SU != nullptr && UseMI->isPHI()) {
    1212             :             if (!MI->isPHI()) {
    1213             :               SDep Dep(SU, SDep::Anti, Reg);
    1214             :               Dep.setLatency(1);
    1215         394 :               I.addPred(Dep);
    1216             :             } else {
    1217             :               HasPhiDef = Reg;
    1218             :               // Add a chain edge to a dependent Phi that isn't an existing
    1219             :               // predecessor.
    1220          40 :               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
    1221          14 :                 I.addPred(SDep(SU, SDep::Barrier));
    1222             :             }
    1223             :           }
    1224             :         }
    1225             :       } else if (MOI->isUse()) {
    1226             :         // If the register is defined by a Phi, then create a true dependence.
    1227        3654 :         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
    1228        3654 :         if (DefMI == nullptr)
    1229             :           continue;
    1230             :         SUnit *SU = getSUnit(DefMI);
    1231        2688 :         if (SU != nullptr && DefMI->isPHI()) {
    1232             :           if (!MI->isPHI()) {
    1233             :             SDep Dep(SU, SDep::Data, Reg);
    1234             :             Dep.setLatency(0);
    1235         757 :             ST.adjustSchedDependency(SU, &I, Dep);
    1236         757 :             I.addPred(Dep);
    1237             :           } else {
    1238             :             HasPhiUse = Reg;
    1239             :             // Add a chain edge to a dependent Phi that isn't an existing
    1240             :             // predecessor.
    1241          38 :             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
    1242           0 :               I.addPred(SDep(SU, SDep::Barrier));
    1243             :           }
    1244             :         }
    1245             :       }
    1246             :     }
    1247             :     // Remove order dependences from an unrelated Phi.
    1248        2011 :     if (!SwpPruneDeps)
    1249             :       continue;
    1250        4897 :     for (auto &PI : I.Preds) {
    1251        2886 :       MachineInstr *PMI = PI.getSUnit()->getInstr();
    1252        1176 :       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
    1253          14 :         if (I.getInstr()->isPHI()) {
    1254          14 :           if (PMI->getOperand(0).getReg() == HasPhiUse)
    1255             :             continue;
    1256          28 :           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
    1257             :             continue;
    1258             :         }
    1259           0 :         RemoveDeps.push_back(PI);
    1260             :       }
    1261             :     }
    1262        2011 :     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
    1263           0 :       I.removePred(RemoveDeps[i]);
    1264             :   }
    1265         190 : }
    1266             : 
    1267             : /// Iterate over each DAG node and see if we can change any dependences
    1268             : /// in order to reduce the recurrence MII.
    1269         190 : void SwingSchedulerDAG::changeDependences() {
    1270             :   // See if an instruction can use a value from the previous iteration.
    1271             :   // If so, we update the base and offset of the instruction and change
    1272             :   // the dependences.
    1273        2201 :   for (SUnit &I : SUnits) {
    1274        2011 :     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    1275        2011 :     int64_t NewOffset = 0;
    1276        2011 :     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
    1277             :                                NewOffset))
    1278        1986 :       continue;
    1279             : 
    1280             :     // Get the MI and SUnit for the instruction that defines the original base.
    1281          25 :     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    1282          25 :     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    1283          25 :     if (!DefMI)
    1284             :       continue;
    1285             :     SUnit *DefSU = getSUnit(DefMI);
    1286          25 :     if (!DefSU)
    1287           0 :       continue;
    1288             :     // Get the MI and SUnit for the instruction that defins the new base.
    1289          25 :     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    1290          25 :     if (!LastMI)
    1291             :       continue;
    1292             :     SUnit *LastSU = getSUnit(LastMI);
    1293          25 :     if (!LastSU)
    1294           0 :       continue;
    1295             : 
    1296          25 :     if (Topo.IsReachable(&I, LastSU))
    1297             :       continue;
    1298             : 
    1299             :     // Remove the dependence. The value now depends on a prior iteration.
    1300             :     SmallVector<SDep, 4> Deps;
    1301          50 :     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
    1302             :          ++P)
    1303          25 :       if (P->getSUnit() == DefSU)
    1304          25 :         Deps.push_back(*P);
    1305          50 :     for (int i = 0, e = Deps.size(); i != e; i++) {
    1306          50 :       Topo.RemovePred(&I, Deps[i].getSUnit());
    1307          25 :       I.removePred(Deps[i]);
    1308             :     }
    1309             :     // Remove the chain dependence between the instructions.
    1310             :     Deps.clear();
    1311         123 :     for (auto &P : LastSU->Preds)
    1312          98 :       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
    1313          22 :         Deps.push_back(P);
    1314          47 :     for (int i = 0, e = Deps.size(); i != e; i++) {
    1315          44 :       Topo.RemovePred(LastSU, Deps[i].getSUnit());
    1316          22 :       LastSU->removePred(Deps[i]);
    1317             :     }
    1318             : 
    1319             :     // Add a dependence between the new instruction and the instruction
    1320             :     // that defines the new base.
    1321          25 :     SDep Dep(&I, SDep::Anti, NewBase);
    1322          25 :     Topo.AddPred(LastSU, &I);
    1323          25 :     LastSU->addPred(Dep);
    1324             : 
    1325             :     // Remember the base and offset information so that we can update the
    1326             :     // instruction during code generation.
    1327          25 :     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
    1328             :   }
    1329         190 : }
    1330             : 
    1331             : namespace {
    1332             : 
    1333             : // FuncUnitSorter - Comparison operator used to sort instructions by
    1334             : // the number of functional unit choices.
    1335       12744 : struct FuncUnitSorter {
    1336             :   const InstrItineraryData *InstrItins;
    1337             :   DenseMap<unsigned, unsigned> Resources;
    1338             : 
    1339         190 :   FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
    1340             : 
    1341             :   // Compute the number of functional unit alternatives needed
    1342             :   // at each stage, and take the minimum value. We prioritize the
    1343             :   // instructions by the least number of choices first.
    1344           0 :   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
    1345        5895 :     unsigned schedClass = Inst->getDesc().getSchedClass();
    1346             :     unsigned min = UINT_MAX;
    1347       17794 :     for (const InstrStage *IS = InstrItins->beginStage(schedClass),
    1348             :                           *IE = InstrItins->endStage(schedClass);
    1349       29584 :          IS != IE; ++IS) {
    1350       17794 :       unsigned funcUnits = IS->getUnits();
    1351             :       unsigned numAlternatives = countPopulation(funcUnits);
    1352       17794 :       if (numAlternatives < min) {
    1353             :         min = numAlternatives;
    1354       13113 :         F = funcUnits;
    1355             :       }
    1356             :     }
    1357           0 :     return min;
    1358             :   }
    1359             : 
    1360             :   // Compute the critical resources needed by the instruction. This
    1361             :   // function records the functional units needed by instructions that
    1362             :   // must use only one functional unit. We use this as a tie breaker
    1363             :   // for computing the resource MII. The instrutions that require
    1364             :   // the same, highly used, functional unit have high priority.
    1365        1589 :   void calcCriticalResources(MachineInstr &MI) {
    1366        1589 :     unsigned SchedClass = MI.getDesc().getSchedClass();
    1367        3945 :     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
    1368             :                           *IE = InstrItins->endStage(SchedClass);
    1369        3945 :          IS != IE; ++IS) {
    1370        2356 :       unsigned FuncUnits = IS->getUnits();
    1371        2356 :       if (countPopulation(FuncUnits) == 1)
    1372         674 :         Resources[FuncUnits]++;
    1373             :     }
    1374        1589 :   }
    1375             : 
    1376             :   /// Return true if IS1 has less priority than IS2.
    1377        5895 :   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    1378        5895 :     unsigned F1 = 0, F2 = 0;
    1379        5895 :     unsigned MFUs1 = minFuncUnits(IS1, F1);
    1380             :     unsigned MFUs2 = minFuncUnits(IS2, F2);
    1381        5895 :     if (MFUs1 == 1 && MFUs2 == 1)
    1382        2232 :       return Resources.lookup(F1) < Resources.lookup(F2);
    1383        4779 :     return MFUs1 > MFUs2;
    1384             :   }
    1385             : };
    1386             : 
    1387             : } // end anonymous namespace
    1388             : 
    1389             : /// Calculate the resource constrained minimum initiation interval for the
    1390             : /// specified loop. We use the DFA to model the resources needed for
    1391             : /// each instruction, and we ignore dependences. A different DFA is created
    1392             : /// for each cycle that is required. When adding a new instruction, we attempt
    1393             : /// to add it to each existing DFA, until a legal space is found. If the
    1394             : /// instruction cannot be reserved in an existing DFA, we create a new one.
    1395         190 : unsigned SwingSchedulerDAG::calculateResMII() {
    1396             :   SmallVector<DFAPacketizer *, 8> Resources;
    1397         190 :   MachineBasicBlock *MBB = Loop.getHeader();
    1398         190 :   Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
    1399             : 
    1400             :   // Sort the instructions by the number of available choices for scheduling,
    1401             :   // least to most. Use the number of critical resources as the tie breaker.
    1402             :   FuncUnitSorter FUS =
    1403         190 :       FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
    1404         190 :   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
    1405         190 :                                    E = MBB->getFirstTerminator();
    1406        1779 :        I != E; ++I)
    1407        1589 :     FUS.calcCriticalResources(*I);
    1408             :   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
    1409         190 :       FuncUnitOrder(FUS);
    1410             : 
    1411         190 :   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
    1412         190 :                                    E = MBB->getFirstTerminator();
    1413        1779 :        I != E; ++I)
    1414        1589 :     FuncUnitOrder.push(&*I);
    1415             : 
    1416        1779 :   while (!FuncUnitOrder.empty()) {
    1417        1589 :     MachineInstr *MI = FuncUnitOrder.top();
    1418        1589 :     FuncUnitOrder.pop();
    1419        3178 :     if (TII->isZeroCost(MI->getOpcode()))
    1420             :       continue;
    1421             :     // Attempt to reserve the instruction in an existing DFA. At least one
    1422             :     // DFA is needed for each cycle.
    1423        1507 :     unsigned NumCycles = getSUnit(MI)->Latency;
    1424             :     unsigned ReservedCycles = 0;
    1425             :     SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
    1426             :     SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
    1427        3014 :     for (unsigned C = 0; C < NumCycles; ++C)
    1428        6050 :       while (RI != RE) {
    1429        5715 :         if ((*RI++)->canReserveResources(*MI)) {
    1430        1172 :           ++ReservedCycles;
    1431        1172 :           break;
    1432             :         }
    1433             :       }
    1434             :     // Start reserving resources using existing DFAs.
    1435        2679 :     for (unsigned C = 0; C < ReservedCycles; ++C) {
    1436        1172 :       --RI;
    1437        1172 :       (*RI)->reserveResources(*MI);
    1438             :     }
    1439             :     // Add new DFAs, if needed, to reserve resources.
    1440        1842 :     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
    1441             :       DFAPacketizer *NewResource =
    1442         335 :           TII->CreateTargetScheduleState(MF.getSubtarget());
    1443             :       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
    1444         335 :       NewResource->reserveResources(*MI);
    1445         335 :       Resources.push_back(NewResource);
    1446             :     }
    1447             :   }
    1448         190 :   int Resmii = Resources.size();
    1449             :   // Delete the memory for each of the DFAs that were created earlier.
    1450         715 :   for (DFAPacketizer *RI : Resources) {
    1451             :     DFAPacketizer *D = RI;
    1452        1050 :     delete D;
    1453             :   }
    1454             :   Resources.clear();
    1455         190 :   return Resmii;
    1456             : }
    1457             : 
    1458             : /// Calculate the recurrence-constrainted minimum initiation interval.
    1459             : /// Iterate over each circuit.  Compute the delay(c) and distance(c)
    1460             : /// for each circuit. The II needs to satisfy the inequality
    1461             : /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
    1462             : /// II that satisfies the inequality, and the RecMII is the maximum
    1463             : /// of those values.
    1464           0 : unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
    1465             :   unsigned RecMII = 0;
    1466             : 
    1467         768 :   for (NodeSet &Nodes : NodeSets) {
    1468         578 :     if (Nodes.empty())
    1469           0 :       continue;
    1470             : 
    1471         578 :     unsigned Delay = Nodes.getLatency();
    1472             :     unsigned Distance = 1;
    1473             : 
    1474             :     // ii = ceil(delay / distance)
    1475             :     unsigned CurMII = (Delay + Distance - 1) / Distance;
    1476             :     Nodes.setRecMII(CurMII);
    1477         578 :     if (CurMII > RecMII)
    1478             :       RecMII = CurMII;
    1479             :   }
    1480             : 
    1481           0 :   return RecMII;
    1482             : }
    1483             : 
    1484             : /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
    1485             : /// but we do this to find the circuits, and then change them back.
    1486         380 : static void swapAntiDependences(std::vector<SUnit> &SUnits) {
    1487             :   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
    1488        4782 :   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    1489        4022 :     SUnit *SU = &SUnits[i];
    1490        5736 :     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
    1491        9758 :          IP != EP; ++IP) {
    1492        5736 :       if (IP->getKind() != SDep::Anti)
    1493             :         continue;
    1494         842 :       DepsAdded.push_back(std::make_pair(SU, *IP));
    1495             :     }
    1496             :   }
    1497         842 :   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
    1498             :                                                           E = DepsAdded.end();
    1499        1222 :        I != E; ++I) {
    1500             :     // Remove this anti dependency and add one in the reverse direction.
    1501         842 :     SUnit *SU = I->first;
    1502         842 :     SDep &D = I->second;
    1503             :     SUnit *TargetSU = D.getSUnit();
    1504         842 :     unsigned Reg = D.getReg();
    1505         842 :     unsigned Lat = D.getLatency();
    1506         842 :     SU->removePred(D);
    1507             :     SDep Dep(SU, SDep::Anti, Reg);
    1508             :     Dep.setLatency(Lat);
    1509         842 :     TargetSU->addPred(Dep);
    1510             :   }
    1511         380 : }
    1512             : 
    1513             : /// Create the adjacency structure of the nodes in the graph.
    1514         190 : void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    1515             :     SwingSchedulerDAG *DAG) {
    1516         380 :   BitVector Added(SUnits.size());
    1517             :   DenseMap<int, int> OutputDeps;
    1518        2391 :   for (int i = 0, e = SUnits.size(); i != e; ++i) {
    1519             :     Added.reset();
    1520             :     // Add any successor to the adjacency matrix and exclude duplicates.
    1521        4879 :     for (auto &SI : SUnits[i].Succs) {
    1522             :       // Only create a back-edge on the first and last nodes of a dependence
    1523             :       // chain. This records any chains and adds them later.
    1524        2868 :       if (SI.getKind() == SDep::Output) {
    1525           0 :         int N = SI.getSUnit()->NodeNum;
    1526           0 :         int BackEdge = i;
    1527           0 :         auto Dep = OutputDeps.find(BackEdge);
    1528           0 :         if (Dep != OutputDeps.end()) {
    1529           0 :           BackEdge = Dep->second;
    1530             :           OutputDeps.erase(Dep);
    1531             :         }
    1532           0 :         OutputDeps[N] = BackEdge;
    1533             :       }
    1534             :       // Do not process a boundary node and a back-edge is processed only
    1535             :       // if it goes to a Phi.
    1536        2868 :       if (SI.getSUnit()->isBoundaryNode() ||
    1537         421 :           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
    1538          27 :         continue;
    1539        2841 :       int N = SI.getSUnit()->NodeNum;
    1540        2841 :       if (!Added.test(N)) {
    1541        2826 :         AdjK[i].push_back(N);
    1542        2826 :         Added.set(N);
    1543             :       }
    1544             :     }
    1545             :     // A chain edge between a store and a load is treated as a back-edge in the
    1546             :     // adjacency matrix.
    1547        4879 :     for (auto &PI : SUnits[i].Preds) {
    1548        6246 :       if (!SUnits[i].getInstr()->mayStore() ||
    1549        1020 :           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
    1550        2750 :         continue;
    1551         118 :       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
    1552         117 :         int N = PI.getSUnit()->NodeNum;
    1553         117 :         if (!Added.test(N)) {
    1554         117 :           AdjK[i].push_back(N);
    1555         117 :           Added.set(N);
    1556             :         }
    1557             :       }
    1558             :     }
    1559             :   }
    1560             :   // Add back-eges in the adjacency matrix for the output dependences.
    1561         190 :   for (auto &OD : OutputDeps)
    1562           0 :     if (!Added.test(OD.second)) {
    1563           0 :       AdjK[OD.first].push_back(OD.second);
    1564           0 :       Added.set(OD.second);
    1565             :     }
    1566         190 : }
    1567             : 
    1568             : /// Identify an elementary circuit in the dependence graph starting at the
    1569             : /// specified node.
    1570       16475 : bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
    1571             :                                           bool HasBackedge) {
    1572       16475 :   SUnit *SV = &SUnits[V];
    1573             :   bool F = false;
    1574       16475 :   Stack.insert(SV);
    1575       16475 :   Blocked.set(V);
    1576             : 
    1577       39508 :   for (auto W : AdjK[V]) {
    1578       23927 :     if (NumPaths > MaxPaths)
    1579             :       break;
    1580       23788 :     if (W < S)
    1581             :       continue;
    1582       20524 :     if (W == S) {
    1583         755 :       if (!HasBackedge)
    1584        1734 :         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
    1585             :       F = true;
    1586         755 :       ++NumPaths;
    1587             :       break;
    1588       39538 :     } else if (!Blocked.test(W)) {
    1589       14464 :       if (circuit(W, S, NodeSets,
    1590       14464 :                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
    1591             :         F = true;
    1592             :     }
    1593             :   }
    1594             : 
    1595       15720 :   if (F)
    1596        2418 :     unblock(V);
    1597             :   else {
    1598       33142 :     for (auto W : AdjK[V]) {
    1599       19085 :       if (W < S)
    1600             :         continue;
    1601       32306 :       if (B[W].count(SV) == 0)
    1602       15629 :         B[W].insert(SV);
    1603             :     }
    1604             :   }
    1605             :   Stack.pop_back();
    1606       16475 :   return F;
    1607             : }
    1608             : 
    1609             : /// Unblock a node in the circuit finding algorithm.
    1610        5468 : void SwingSchedulerDAG::Circuits::unblock(int U) {
    1611        5468 :   Blocked.reset(U);
    1612        5468 :   SmallPtrSet<SUnit *, 4> &BU = B[U];
    1613       10230 :   while (!BU.empty()) {
    1614        4762 :     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
    1615             :     assert(SI != BU.end() && "Invalid B set.");
    1616        4762 :     SUnit *W = *SI;
    1617             :     BU.erase(W);
    1618        9524 :     if (Blocked.test(W->NodeNum))
    1619        3050 :       unblock(W->NodeNum);
    1620             :   }
    1621        5468 : }
    1622             : 
    1623             : /// Identify all the elementary circuits in the dependence graph using
    1624             : /// Johnson's circuit algorithm.
    1625         190 : void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
    1626             :   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
    1627             :   // but we do this to find the circuits, and then change them back.
    1628         190 :   swapAntiDependences(SUnits);
    1629             : 
    1630         380 :   Circuits Cir(SUnits, Topo);
    1631             :   // Create the adjacency structure.
    1632         190 :   Cir.createAdjacencyStructure(this);
    1633        2391 :   for (int i = 0, e = SUnits.size(); i != e; ++i) {
    1634        2011 :     Cir.reset();
    1635        2011 :     Cir.circuit(i, i, NodeSets);
    1636             :   }
    1637             : 
    1638             :   // Change the dependences back so that we've created a DAG again.
    1639         190 :   swapAntiDependences(SUnits);
    1640         190 : }
    1641             : 
    1642             : // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
    1643             : // is loop-carried to the USE in next iteration. This will help pipeliner avoid
    1644             : // additional copies that are needed across iterations. An artificial dependence
    1645             : // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
    1646             : 
    1647             : // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
    1648             : // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
    1649             : // PHI-------True-Dep------> USEOfPhi
    1650             : 
    1651             : // The mutation creates
    1652             : // USEOfPHI -------Artificial-Dep---> SRCOfCopy
    1653             : 
    1654             : // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
    1655             : // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
    1656             : // late  to avoid additional copies across iterations. The possible scheduling
    1657             : // order would be
    1658             : // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
    1659             : 
    1660         190 : void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
    1661        2201 :   for (SUnit &SU : DAG->SUnits) {
    1662             :     // Find the COPY/REG_SEQUENCE instruction.
    1663        4022 :     if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
    1664        2004 :       continue;
    1665             : 
    1666             :     // Record the loop carried PHIs.
    1667             :     SmallVector<SUnit *, 4> PHISUs;
    1668             :     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
    1669             :     SmallVector<SUnit *, 4> SrcSUs;
    1670             : 
    1671         228 :     for (auto &Dep : SU.Preds) {
    1672         148 :       SUnit *TmpSU = Dep.getSUnit();
    1673         148 :       MachineInstr *TmpMI = TmpSU->getInstr();
    1674             :       SDep::Kind DepKind = Dep.getKind();
    1675             :       // Save the loop carried PHI.
    1676         148 :       if (DepKind == SDep::Anti && TmpMI->isPHI())
    1677           8 :         PHISUs.push_back(TmpSU);
    1678             :       // Save the source of COPY/REG_SEQUENCE.
    1679             :       // If the source has no pre-decessors, we will end up creating cycles.
    1680         258 :       else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
    1681         115 :         SrcSUs.push_back(TmpSU);
    1682             :     }
    1683             : 
    1684          80 :     if (PHISUs.size() == 0 || SrcSUs.size() == 0)
    1685             :       continue;
    1686             : 
    1687             :     // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    1688             :     // SUnit to the container.
    1689             :     SmallVector<SUnit *, 8> UseSUs;
    1690          18 :     for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
    1691          29 :       for (auto &Dep : (*I)->Succs) {
    1692          18 :         if (Dep.getKind() != SDep::Data)
    1693          11 :           continue;
    1694             : 
    1695          11 :         SUnit *TmpSU = Dep.getSUnit();
    1696          11 :         MachineInstr *TmpMI = TmpSU->getInstr();
    1697          11 :         if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
    1698           4 :           PHISUs.push_back(TmpSU);
    1699           4 :           continue;
    1700             :         }
    1701           7 :         UseSUs.push_back(TmpSU);
    1702             :       }
    1703             :     }
    1704             : 
    1705          14 :     if (UseSUs.size() == 0)
    1706             :       continue;
    1707             : 
    1708             :     SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    1709             :     // Add the artificial dependencies if it does not form a cycle.
    1710          14 :     for (auto I : UseSUs) {
    1711          14 :       for (auto Src : SrcSUs) {
    1712           7 :         if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
    1713           4 :           Src->addPred(SDep(I, SDep::Artificial));
    1714           4 :           SDAG->Topo.AddPred(Src, I);
    1715             :         }
    1716             :       }
    1717             :     }
    1718             :   }
    1719         190 : }
    1720             : 
    1721             : /// Return true for DAG nodes that we ignore when computing the cost functions.
    1722             : /// We ignore the back-edge recurrence in order to avoid unbounded recursion
    1723             : /// in the calculation of the ASAP, ALAP, etc functions.
    1724             : static bool ignoreDependence(const SDep &D, bool isPred) {
    1725             :   if (D.isArtificial())
    1726             :     return true;
    1727       13613 :   return D.getKind() == SDep::Anti && isPred;
    1728             : }
    1729             : 
    1730             : /// Compute several functions need to order the nodes for scheduling.
    1731             : ///  ASAP - Earliest time to schedule a node.
    1732             : ///  ALAP - Latest time to schedule a node.
    1733             : ///  MOV - Mobility function, difference between ALAP and ASAP.
    1734             : ///  D - Depth of each node.
    1735             : ///  H - Height of each node.
    1736         188 : void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
    1737         376 :   ScheduleInfo.resize(SUnits.size());
    1738             : 
    1739             :   LLVM_DEBUG({
    1740             :     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
    1741             :                                                     E = Topo.end();
    1742             :          I != E; ++I) {
    1743             :       const SUnit &SU = SUnits[*I];
    1744             :       dumpNode(SU);
    1745             :     }
    1746             :   });
    1747             : 
    1748         188 :   int maxASAP = 0;
    1749             :   // Compute ASAP and ZeroLatencyDepth.
    1750             :   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
    1751             :                                                   E = Topo.end();
    1752        2033 :        I != E; ++I) {
    1753        1845 :     int asap = 0;
    1754        1845 :     int zeroLatencyDepth = 0;
    1755        1845 :     SUnit *SU = &SUnits[*I];
    1756        2603 :     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
    1757             :                                     EP = SU->Preds.end();
    1758        4448 :          IP != EP; ++IP) {
    1759             :       SUnit *pred = IP->getSUnit();
    1760        2603 :       if (IP->getLatency() == 0)
    1761        1100 :         zeroLatencyDepth =
    1762        3032 :             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
    1763             :       if (ignoreDependence(*IP, true))
    1764             :         continue;
    1765        5328 :       asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
    1766             :                                   getDistance(pred, SU, *IP) * MII));
    1767             :     }
    1768        1845 :     maxASAP = std::max(maxASAP, asap);
    1769        1845 :     ScheduleInfo[*I].ASAP = asap;
    1770        3690 :     ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
    1771             :   }
    1772             : 
    1773             :   // Compute ALAP, ZeroLatencyHeight, and MOV.
    1774             :   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
    1775             :                                                           E = Topo.rend();
    1776        2033 :        I != E; ++I) {
    1777        1845 :     int alap = maxASAP;
    1778        1845 :     int zeroLatencyHeight = 0;
    1779        1845 :     SUnit *SU = &SUnits[*I];
    1780        2603 :     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
    1781             :                                     ES = SU->Succs.end();
    1782        4448 :          IS != ES; ++IS) {
    1783             :       SUnit *succ = IS->getSUnit();
    1784        2603 :       if (IS->getLatency() == 0)
    1785        1100 :         zeroLatencyHeight =
    1786        2952 :             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
    1787             :       if (ignoreDependence(*IS, true))
    1788             :         continue;
    1789        5738 :       alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
    1790             :                                   getDistance(SU, succ, *IS) * MII));
    1791             :     }
    1792             : 
    1793        1845 :     ScheduleInfo[*I].ALAP = alap;
    1794        3690 :     ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
    1795             :   }
    1796             : 
    1797             :   // After computing the node functions, compute the summary for each node set.
    1798         608 :   for (NodeSet &I : NodeSets)
    1799         420 :     I.computeNodeSetInfo(this);
    1800             : 
    1801             :   LLVM_DEBUG({
    1802             :     for (unsigned i = 0; i < SUnits.size(); i++) {
    1803             :       dbgs() << "\tNode " << i << ":\n";
    1804             :       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
    1805             :       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
    1806             :       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
    1807             :       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
    1808             :       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
    1809             :       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
    1810             :       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    1811             :     }
    1812             :   });
    1813         188 : }
    1814             : 
    1815             : /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
    1816             : /// as the predecessors of the elements of NodeOrder that are not also in
    1817             : /// NodeOrder.
    1818         849 : static bool pred_L(SetVector<SUnit *> &NodeOrder,
    1819             :                    SmallSetVector<SUnit *, 8> &Preds,
    1820             :                    const NodeSet *S = nullptr) {
    1821         849 :   Preds.clear();
    1822             :   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
    1823        7472 :        I != E; ++I) {
    1824       17998 :     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
    1825       17998 :          PI != PE; ++PI) {
    1826       11375 :       if (S && S->count(PI->getSUnit()) == 0)
    1827        2948 :         continue;
    1828             :       if (ignoreDependence(*PI, true))
    1829             :         continue;
    1830        7095 :       if (NodeOrder.count(PI->getSUnit()) == 0)
    1831        1018 :         Preds.insert(PI->getSUnit());
    1832             :     }
    1833             :     // Back-edges are predecessors with an anti-dependence.
    1834       16597 :     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
    1835             :                                     ES = (*I)->Succs.end();
    1836       16597 :          IS != ES; ++IS) {
    1837        9974 :       if (IS->getKind() != SDep::Anti)
    1838             :         continue;
    1839        1313 :       if (S && S->count(IS->getSUnit()) == 0)
    1840         304 :         continue;
    1841        1009 :       if (NodeOrder.count(IS->getSUnit()) == 0)
    1842           0 :         Preds.insert(IS->getSUnit());
    1843             :     }
    1844             :   }
    1845         849 :   return !Preds.empty();
    1846             : }
    1847             : 
    1848             : /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
    1849             : /// as the successors of the elements of NodeOrder that are not also in
    1850             : /// NodeOrder.
    1851        2476 : static bool succ_L(SetVector<SUnit *> &NodeOrder,
    1852             :                    SmallSetVector<SUnit *, 8> &Succs,
    1853             :                    const NodeSet *S = nullptr) {
    1854        2476 :   Succs.clear();
    1855             :   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
    1856       14473 :        I != E; ++I) {
    1857       30113 :     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
    1858       30113 :          SI != SE; ++SI) {
    1859       18116 :       if (S && S->count(SI->getSUnit()) == 0)
    1860        2946 :         continue;
    1861             :       if (ignoreDependence(*SI, false))
    1862             :         continue;
    1863       15128 :       if (NodeOrder.count(SI->getSUnit()) == 0)
    1864        1933 :         Succs.insert(SI->getSUnit());
    1865             :     }
    1866       33558 :     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
    1867             :                                     PE = (*I)->Preds.end();
    1868       33558 :          PI != PE; ++PI) {
    1869       21561 :       if (PI->getKind() != SDep::Anti)
    1870             :         continue;
    1871        3741 :       if (S && S->count(PI->getSUnit()) == 0)
    1872         468 :         continue;
    1873        3273 :       if (NodeOrder.count(PI->getSUnit()) == 0)
    1874         887 :         Succs.insert(PI->getSUnit());
    1875             :     }
    1876             :   }
    1877        2476 :   return !Succs.empty();
    1878             : }
    1879             : 
    1880             : /// Return true if there is a path from the specified node to any of the nodes
    1881             : /// in DestNodes. Keep track and return the nodes in any path.
    1882        5219 : static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
    1883             :                         SetVector<SUnit *> &DestNodes,
    1884             :                         SetVector<SUnit *> &Exclude,
    1885             :                         SmallPtrSet<SUnit *, 8> &Visited) {
    1886        5219 :   if (Cur->isBoundaryNode())
    1887             :     return false;
    1888        5219 :   if (Exclude.count(Cur) != 0)
    1889             :     return false;
    1890        8898 :   if (DestNodes.count(Cur) != 0)
    1891             :     return true;
    1892        3951 :   if (!Visited.insert(Cur).second)
    1893        1816 :     return Path.count(Cur) != 0;
    1894             :   bool FoundPath = false;
    1895        6937 :   for (auto &SI : Cur->Succs)
    1896        3894 :     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
    1897        7901 :   for (auto &PI : Cur->Preds)
    1898        4858 :     if (PI.getKind() == SDep::Anti)
    1899         291 :       FoundPath |=
    1900         291 :           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
    1901        3043 :   if (FoundPath)
    1902         693 :     Path.insert(Cur);
    1903             :   return FoundPath;
    1904             : }
    1905             : 
    1906             : /// Return true if Set1 is a subset of Set2.
    1907         571 : template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
    1908        1034 :   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
    1909        1004 :     if (Set2.count(*I) == 0)
    1910         192 :       return false;
    1911             :   return true;
    1912             : }
    1913         391 : 
    1914         787 : /// Compute the live-out registers for the instructions in a node-set.
    1915         588 : /// The live-out registers are those that are defined in the node-set,
    1916         192 : /// but not used. Except for use operands of Phis.
    1917             : static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
    1918             :                             NodeSet &NS) {
    1919         180 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    1920         247 :   MachineRegisterInfo &MRI = MF.getRegInfo();
    1921         416 :   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
    1922             :   SmallSet<unsigned, 4> Uses;
    1923             :   for (SUnit *SU : NS) {
    1924             :     const MachineInstr *MI = SU->getInstr();
    1925             :     if (MI->isPHI())
    1926             :       continue;
    1927             :     for (const MachineOperand &MO : MI->operands())
    1928             :       if (MO.isReg() && MO.isUse()) {
    1929          83 :         unsigned Reg = MO.getReg();
    1930             :         if (TargetRegisterInfo::isVirtualRegister(Reg))
    1931          83 :           Uses.insert(Reg);
    1932          83 :         else if (MRI.isAllocatable(Reg))
    1933             :           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
    1934          83 :             Uses.insert(*Units);
    1935         645 :       }
    1936         562 :   }
    1937             :   for (SUnit *SU : NS)
    1938             :     for (const MachineOperand &MO : SU->getInstr()->operands())
    1939        2519 :       if (MO.isReg() && MO.isDef() && !MO.isDead()) {
    1940        1976 :         unsigned Reg = MO.getReg();
    1941        1131 :         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
    1942        1131 :           if (!Uses.count(Reg))
    1943        1129 :             LiveOutRegs.push_back(RegisterMaskPair(Reg,
    1944           2 :                                                    LaneBitmask::getNone()));
    1945           0 :         } else if (MRI.isAllocatable(Reg)) {
    1946           0 :           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
    1947             :             if (!Uses.count(*Units))
    1948             :               LiveOutRegs.push_back(RegisterMaskPair(*Units,
    1949         645 :                                                      LaneBitmask::getNone()));
    1950        2633 :         }
    1951        2071 :       }
    1952         542 :   RPTracker.addLiveRegs(LiveOutRegs);
    1953         542 : }
    1954         542 : 
    1955          95 : /// A heuristic to filter nodes in recurrent node-sets if the register
    1956             : /// pressure of a set is too high.
    1957           0 : void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
    1958           0 :   for (auto &NS : NodeSets) {
    1959           0 :     // Skip small node-sets since they won't cause register pressure problems.
    1960           0 :     if (NS.size() <= 2)
    1961             :       continue;
    1962             :     IntervalPressure RecRegPressure;
    1963             :     RegPressureTracker RecRPTracker(RecRegPressure);
    1964          83 :     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    1965          83 :     computeLiveOuts(MF, RecRPTracker, NS);
    1966             :     RecRPTracker.closeBottom();
    1967             : 
    1968             :     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    1969         188 :     llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
    1970         608 :       return A->NodeNum > B->NodeNum;
    1971             :     });
    1972         420 : 
    1973         337 :     for (auto &SU : SUnits) {
    1974             :       // Since we're computing the register pressure for a subset of the
    1975          83 :       // instructions in a block, we need to set the tracker for each
    1976         166 :       // instruction in the node-set. The tracker is set to the instruction
    1977          83 :       // just after the one we're interested in.
    1978          83 :       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
    1979             :       RecRPTracker.setPos(std::next(CurInstI));
    1980             : 
    1981             :       RegPressureDelta RPDelta;
    1982           0 :       ArrayRef<PressureChange> CriticalPSets;
    1983             :       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
    1984             :                                              CriticalPSets,
    1985         639 :                                              RecRegPressure.MaxSetPressure);
    1986             :       if (RPDelta.Excess.isValid()) {
    1987             :         LLVM_DEBUG(
    1988             :             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
    1989             :                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
    1990         557 :                    << ":" << RPDelta.Excess.getUnitInc());
    1991         557 :         NS.setExceedPressure(SU);
    1992             :         break;
    1993         557 :       }
    1994         557 :       RecRPTracker.recede();
    1995         557 :     }
    1996             :   }
    1997             : }
    1998         557 : 
    1999             : /// A heuristic to colocate node sets that have the same set of
    2000             : /// successors.
    2001             : void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
    2002             :   unsigned Colocate = 0;
    2003           1 :   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    2004           1 :     NodeSet &N1 = NodeSets[i];
    2005             :     SmallSetVector<SUnit *, 8> S1;
    2006         556 :     if (N1.empty() || !succ_L(N1, S1))
    2007             :       continue;
    2008             :     for (int j = i + 1; j < e; ++j) {
    2009         188 :       NodeSet &N2 = NodeSets[j];
    2010             :       if (N1.compareRecMII(N2) != 0)
    2011             :         continue;
    2012             :       SmallSetVector<SUnit *, 8> S2;
    2013           0 :       if (N2.empty() || !succ_L(N2, S2))
    2014             :         continue;
    2015           0 :       if (isSubset(S1, S2) && S1.size() == S2.size()) {
    2016           0 :         N1.setColocate(++Colocate);
    2017             :         N2.setColocate(Colocate);
    2018           0 :         break;
    2019             :       }
    2020           0 :     }
    2021           0 :   }
    2022           0 : }
    2023           0 : 
    2024             : /// Check if the existing node-sets are profitable. If not, then ignore the
    2025           0 : /// recurrent node-sets, and attempt to schedule all nodes together. This is
    2026             : /// a heuristic. If the MII is large and all the recurrent node-sets are small,
    2027           0 : /// then it's best to try to schedule all instructions together instead of
    2028           0 : /// starting with the recurrent node-sets.
    2029             : void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
    2030             :   // Look for loops with a large MII.
    2031             :   if (MII < 17)
    2032             :     return;
    2033             :   // Check if the node-set contains only a simple add recurrence.
    2034           0 :   for (auto &NS : NodeSets) {
    2035             :     if (NS.getRecMII() > 2)
    2036             :       return;
    2037             :     if (NS.getMaxDepth() > MII)
    2038             :       return;
    2039             :   }
    2040             :   NodeSets.clear();
    2041           0 :   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
    2042             :   return;
    2043           0 : }
    2044           0 : 
    2045             : /// Add the nodes that do not belong to a recurrence set into groups
    2046           0 : /// based upon connected componenets.
    2047           0 : void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
    2048           0 :   SetVector<SUnit *> NodesAdded;
    2049           0 :   SmallPtrSet<SUnit *, 8> Visited;
    2050           0 :   // Add the nodes that are on a path between the previous node sets and
    2051             :   // the current node set.
    2052             :   for (NodeSet &I : NodeSets) {
    2053             :     SmallSetVector<SUnit *, 8> N;
    2054             :     // Add the nodes from the current node set to the previous node set.
    2055             :     if (succ_L(I, N)) {
    2056             :       SetVector<SUnit *> Path;
    2057             :       for (SUnit *NI : N) {
    2058             :         Visited.clear();
    2059         188 :         computePath(NI, Path, NodesAdded, I, Visited);
    2060         188 :       }
    2061             :       if (!Path.empty())
    2062             :         I.insert(Path.begin(), Path.end());
    2063             :     }
    2064         608 :     // Add the nodes from the previous node set to the current node set.
    2065             :     N.clear();
    2066             :     if (succ_L(NodesAdded, N)) {
    2067         420 :       SetVector<SUnit *> Path;
    2068         315 :       for (SUnit *NI : N) {
    2069         922 :         Visited.clear();
    2070         607 :         computePath(NI, Path, I, NodesAdded, Visited);
    2071         607 :       }
    2072             :       if (!Path.empty())
    2073         315 :         I.insert(Path.begin(), Path.end());
    2074          53 :     }
    2075             :     NodesAdded.insert(I.begin(), I.end());
    2076             :   }
    2077             : 
    2078         420 :   // Create a new node set with the connected nodes of any successor of a node
    2079         165 :   // in a recurrent set.
    2080         592 :   NodeSet NewSet;
    2081         427 :   SmallSetVector<SUnit *, 8> N;
    2082         427 :   if (succ_L(NodesAdded, N))
    2083             :     for (SUnit *I : N)
    2084         165 :       addConnectedNodes(I, NewSet, NodesAdded);
    2085          30 :   if (!NewSet.empty())
    2086             :     NodeSets.push_back(NewSet);
    2087         420 : 
    2088             :   // Create a new node set with the connected nodes of any predecessor of a node
    2089             :   // in a recurrent set.
    2090             :   NewSet.clear();
    2091             :   if (pred_L(NodesAdded, N))
    2092             :     for (SUnit *I : N)
    2093             :       addConnectedNodes(I, NewSet, NodesAdded);
    2094         188 :   if (!NewSet.empty())
    2095         218 :     NodeSets.push_back(NewSet);
    2096         141 : 
    2097         188 :   // Create new nodes sets with the connected nodes any remaining node that
    2098          77 :   // has no predecessor.
    2099             :   for (unsigned i = 0; i < SUnits.size(); ++i) {
    2100             :     SUnit *SU = &SUnits[i];
    2101             :     if (NodesAdded.count(SU) == 0) {
    2102             :       NewSet.clear();
    2103         188 :       addConnectedNodes(SU, NewSet, NodesAdded);
    2104          49 :       if (!NewSet.empty())
    2105          33 :         NodeSets.push_back(NewSet);
    2106         188 :     }
    2107          16 :   }
    2108             : }
    2109             : 
    2110             : /// Add the node to the set, and add all is its connected nodes to the set.
    2111        2221 : void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
    2112             :                                           SetVector<SUnit *> &NodesAdded) {
    2113        1845 :   NewSet.insert(SU);
    2114             :   NodesAdded.insert(SU);
    2115           9 :   for (auto &SI : SU->Succs) {
    2116           9 :     SUnit *Successor = SI.getSUnit();
    2117           9 :     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
    2118             :       addConnectedNodes(Successor, NewSet, NodesAdded);
    2119             :   }
    2120         188 :   for (auto &PI : SU->Preds) {
    2121             :     SUnit *Predecessor = PI.getSUnit();
    2122             :     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
    2123         545 :       addConnectedNodes(Predecessor, NewSet, NodesAdded);
    2124             :   }
    2125         545 : }
    2126         545 : 
    2127        1219 : /// Return true if Set1 contains elements in Set2. The elements in common
    2128             : /// are returned in a different container.
    2129         674 : static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
    2130         196 :                         SmallSetVector<SUnit *, 8> &Result) {
    2131             :   Result.clear();
    2132        1254 :   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
    2133             :     SUnit *SU = Set1[i];
    2134         709 :     if (Set2.count(SU) != 0)
    2135         166 :       Result.insert(SU);
    2136             :   }
    2137         545 :   return !Result.empty();
    2138             : }
    2139             : 
    2140             : /// Merge the recurrence node sets that have the same initial node.
    2141         306 : void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
    2142             :   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
    2143         306 :        ++I) {
    2144         628 :     NodeSet &NI = *I;
    2145         644 :     for (NodeSetType::iterator J = I + 1; J != E;) {
    2146             :       NodeSet &NJ = *J;
    2147          63 :       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
    2148             :         if (NJ.compareRecMII(NI) > 0)
    2149         306 :           NI.setRecMII(NJ.getRecMII());
    2150             :         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
    2151             :              ++NII)
    2152             :           I->insert(*NII);
    2153           0 :         NodeSets.erase(J);
    2154           0 :         E = NodeSets.end();
    2155             :       } else {
    2156             :         ++J;
    2157           0 :       }
    2158             :     }
    2159           0 :   }
    2160           0 : }
    2161             : 
    2162           0 : /// Remove nodes that have been scheduled in previous NodeSets.
    2163             : void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
    2164           0 :   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
    2165           0 :        ++I)
    2166             :     for (NodeSetType::iterator J = I + 1; J != E;) {
    2167             :       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
    2168           0 : 
    2169             :       if (J->empty()) {
    2170             :         NodeSets.erase(J);
    2171             :         E = NodeSets.end();
    2172           0 :       } else {
    2173             :         ++J;
    2174             :       }
    2175           0 :     }
    2176           0 : }
    2177             : 
    2178           0 : /// Compute an ordered list of the dependence graph nodes, which
    2179        2960 : /// indicates the order that the nodes will be scheduled.  This is a
    2180             : /// two-level algorithm. First, a partial order is created, which
    2181           0 : /// consists of a list of sets ordered from highest to lowest priority.
    2182           0 : void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
    2183             :   SmallSetVector<SUnit *, 8> R;
    2184             :   NodeOrder.clear();
    2185           0 : 
    2186             :   for (auto &Nodes : NodeSets) {
    2187             :     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    2188           0 :     OrderKind Order;
    2189             :     SmallSetVector<SUnit *, 8> N;
    2190             :     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
    2191             :       R.insert(N.begin(), N.end());
    2192             :       Order = BottomUp;
    2193             :       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
    2194         188 :     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
    2195             :       R.insert(N.begin(), N.end());
    2196         188 :       Order = TopDown;
    2197             :       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
    2198         693 :     } else if (isIntersect(N, Nodes, R)) {
    2199             :       // If some of the successors are in the existing node-set, then use the
    2200             :       // top-down ordering.
    2201             :       Order = TopDown;
    2202         505 :       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
    2203          95 :     } else if (NodeSets.size() == 1) {
    2204             :       for (auto &N : Nodes)
    2205             :         if (N->Succs.size() == 0)
    2206         410 :           R.insert(N);
    2207         104 :       Order = BottomUp;
    2208             :       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
    2209             :     } else {
    2210         306 :       // Find the node with the highest ASAP.
    2211             :       SUnit *maxASAP = nullptr;
    2212             :       for (SUnit *SU : Nodes) {
    2213             :         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
    2214             :             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
    2215         265 :           maxASAP = SU;
    2216         179 :       }
    2217         169 :       R.insert(maxASAP);
    2218          13 :       Order = BottomUp;
    2219             :       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
    2220             :     }
    2221             : 
    2222             :     while (!R.empty()) {
    2223         255 :       if (Order == TopDown) {
    2224         936 :         // Choose the node with the maximum height.  If more than one, choose
    2225         681 :         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
    2226         211 :         // choose the node with the lowest MOV.
    2227         625 :         while (!R.empty()) {
    2228             :           SUnit *maxHeight = nullptr;
    2229         255 :           for (SUnit *I : R) {
    2230             :             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
    2231             :               maxHeight = I;
    2232             :             else if (getHeight(I) == getHeight(maxHeight) &&
    2233             :                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
    2234        1029 :               maxHeight = I;
    2235         524 :             else if (getHeight(I) == getHeight(maxHeight) &&
    2236             :                      getZeroLatencyHeight(I) ==
    2237             :                          getZeroLatencyHeight(maxHeight) &&
    2238             :                      getMOV(I) < getMOV(maxHeight))
    2239         723 :               maxHeight = I;
    2240         567 :           }
    2241        2044 :           NodeOrder.insert(maxHeight);
    2242        2422 :           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
    2243         703 :           R.remove(maxHeight);
    2244         774 :           for (const auto &I : maxHeight->Succs) {
    2245         968 :             if (Nodes.count(I.getSUnit()) == 0)
    2246          33 :               continue;
    2247        1192 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2248         451 :               continue;
    2249         741 :             if (ignoreDependence(I, false))
    2250             :               continue;
    2251          19 :             R.insert(I.getSUnit());
    2252             :           }
    2253         567 :           // Back-edges are predecessors with an anti-dependence.
    2254             :           for (const auto &I : maxHeight->Preds) {
    2255         567 :             if (I.getKind() != SDep::Anti)
    2256        1252 :               continue;
    2257             :             if (Nodes.count(I.getSUnit()) == 0)
    2258         155 :               continue;
    2259         530 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2260             :               continue;
    2261             :             R.insert(I.getSUnit());
    2262             :           }
    2263         421 :         }
    2264             :         Order = BottomUp;
    2265             :         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
    2266        1415 :         SmallSetVector<SUnit *, 8> N;
    2267         848 :         if (pred_L(NodeOrder, N, &Nodes))
    2268             :           R.insert(N.begin(), N.end());
    2269             :       } else {
    2270           6 :         // Choose the node with the maximum depth.  If more than one, choose
    2271          60 :         // the node with the maximum ZeroLatencyDepth. If still more than one,
    2272             :         // choose the node with the lowest MOV.
    2273          58 :         while (!R.empty()) {
    2274             :           SUnit *maxDepth = nullptr;
    2275             :           for (SUnit *I : R) {
    2276             :             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
    2277             :               maxDepth = I;
    2278             :             else if (getDepth(I) == getDepth(maxDepth) &&
    2279         156 :                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
    2280           8 :               maxDepth = I;
    2281             :             else if (getDepth(I) == getDepth(maxDepth) &&
    2282             :                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
    2283             :                      getMOV(I) < getMOV(maxDepth))
    2284             :               maxDepth = I;
    2285        1645 :           }
    2286        1278 :           NodeOrder.insert(maxDepth);
    2287        4354 :           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
    2288        4882 :           R.remove(maxDepth);
    2289        1635 :           if (Nodes.isExceedSU(maxDepth)) {
    2290        1441 :             Order = TopDown;
    2291        1608 :             R.clear();
    2292          48 :             R.insert(Nodes.getNode(0));
    2293        2149 :             break;
    2294        1393 :           }
    2295             :           for (const auto &I : maxDepth->Preds) {
    2296         172 :             if (Nodes.count(I.getSUnit()) == 0)
    2297             :               continue;
    2298        1278 :             if (NodeOrder.count(I.getSUnit()) != 0)
    2299             :               continue;
    2300        1278 :             R.insert(I.getSUnit());
    2301        1278 :           }
    2302             :           // Back-edges are predecessors with an anti-dependence.
    2303             :           for (const auto &I : maxDepth->Succs) {
    2304           1 :             if (I.getKind() != SDep::Anti)
    2305           1 :               continue;
    2306             :             if (Nodes.count(I.getSUnit()) == 0)
    2307        3030 :               continue;
    2308             :             if (NodeOrder.count(I.getSUnit()) != 0)
    2309         429 :               continue;
    2310        1324 :             R.insert(I.getSUnit());
    2311             :           }
    2312        1236 :         }
    2313             :         Order = TopDown;
    2314             :         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
    2315        3194 :         SmallSetVector<SUnit *, 8> N;
    2316        1917 :         if (succ_L(NodeOrder, N, &Nodes))
    2317             :           R.insert(N.begin(), N.end());
    2318             :       }
    2319          38 :     }
    2320         263 :     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
    2321             :   }
    2322          47 : 
    2323             :   LLVM_DEBUG({
    2324             :     dbgs() << "Node order: ";
    2325             :     for (SUnit *I : NodeOrder)
    2326             :       dbgs() << " " << I->NodeNum << " ";
    2327             :     dbgs() << "\n";
    2328         368 :   });
    2329          11 : }
    2330             : 
    2331             : /// Process the nodes in the computed order and create the pipelined schedule
    2332             : /// of the instructions, if possible. Return true if a schedule is found.
    2333             : bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
    2334             :   if (NodeOrder.empty())
    2335             :     return false;
    2336             : 
    2337             :   bool scheduleFound = false;
    2338             :   // Keep increasing II until a valid schedule is found.
    2339             :   for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
    2340             :     Schedule.reset();
    2341         188 :     Schedule.setInitiationInterval(II);
    2342             :     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
    2343             : 
    2344             :     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    2345         188 :     SetVector<SUnit *>::iterator NE = NodeOrder.end();
    2346         188 :     do {
    2347             :       SUnit *SU = *NI;
    2348             : 
    2349             :       // Compute the schedule time for the instruction, which is based
    2350             :       // upon the scheduled time for any predecessors/successors.
    2351         452 :       int EarlyStart = INT_MIN;
    2352         272 :       int LateStart = INT_MAX;
    2353         272 :       // These values are set when the size of the schedule window is limited
    2354             :       // due to chain dependences.
    2355             :       int SchedEnd = INT_MAX;
    2356             :       int SchedStart = INT_MIN;
    2357             :       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
    2358             :                             II, this);
    2359        3030 :       LLVM_DEBUG({
    2360             :         dbgs() << "Inst (" << SU->NodeNum << ") ";
    2361             :         SU->getInstr()->dump();
    2362             :         dbgs() << "\n";
    2363        3030 :       });
    2364        3030 :       LLVM_DEBUG({
    2365             :         dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
    2366             :                << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
    2367        3030 :       });
    2368        3030 : 
    2369        3030 :       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
    2370             :           SchedStart > LateStart)
    2371             :         scheduleFound = false;
    2372             :       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
    2373             :         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
    2374             :         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
    2375             :       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
    2376             :         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
    2377             :         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
    2378             :       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
    2379             :         SchedEnd =
    2380             :             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
    2381        3030 :         // When scheduling a Phi it is better to start at the late cycle and go
    2382        2982 :         // backwards. The default order may insert the Phi too far away from
    2383             :         // its first dependence.
    2384        2952 :         if (SU->getInstr()->isPHI())
    2385         698 :           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
    2386         698 :         else
    2387        2254 :           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
    2388        1498 :       } else {
    2389        1498 :         int FirstCycle = Schedule.getFirstCycle();
    2390         756 :         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
    2391         418 :                                         FirstCycle + getASAP(SU) + II - 1, II);
    2392         418 :       }
    2393             :       // Even if we find a schedule, make sure the schedule doesn't exceed the
    2394             :       // allowable number of stages. We keep trying if this happens.
    2395             :       if (scheduleFound)
    2396         418 :         if (SwpMaxStages > -1 &&
    2397         335 :             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
    2398             :           scheduleFound = false;
    2399          83 : 
    2400             :       LLVM_DEBUG({
    2401         338 :         if (!scheduleFound)
    2402         338 :           dbgs() << "\tCan't schedule\n";
    2403         676 :       });
    2404             :     } while (++NI != NE && scheduleFound);
    2405             : 
    2406             :     // If a schedule is found, check if it is a valid schedule too.
    2407        2952 :     if (scheduleFound)
    2408        2940 :       scheduleFound = Schedule.isValidSchedule(this);
    2409        2940 :   }
    2410             : 
    2411             :   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
    2412             : 
    2413             :   if (scheduleFound)
    2414             :     Schedule.finalizeSchedule(this);
    2415             :   else
    2416        3030 :     Schedule.reset();
    2417             : 
    2418             :   return scheduleFound && Schedule.getMaxStageCount() > 0;
    2419         272 : }
    2420         175 : 
    2421             : /// Given a schedule for the loop, generate a new version of the loop,
    2422             : /// and replace the old version.  This function generates a prolog
    2423             : /// that contains the initial iterations in the pipeline, and kernel
    2424             : /// loop, and the epilogue that contains the code for the final
    2425         180 : /// iterations.
    2426         175 : void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
    2427             :   // Create a new basic block for the kernel and add it to the CFG.
    2428           5 :   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    2429             : 
    2430         180 :   unsigned MaxStageCount = Schedule.getMaxStageCount();
    2431             : 
    2432             :   // Remember the registers that are used in different stages. The index is
    2433             :   // the iteration, or stage, that the instruction is scheduled in.  This is
    2434             :   // a map between register names in the original block and the names created
    2435             :   // in each stage of the pipelined loop.
    2436             :   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
    2437             :   InstrMapTy InstrMap;
    2438         125 : 
    2439             :   SmallVector<MachineBasicBlock *, 4> PrologBBs;
    2440         125 :   // Generate the prolog instructions that set up the pipeline.
    2441             :   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
    2442             :   MF.insert(BB->getIterator(), KernelBB);
    2443             : 
    2444             :   // Rearrange the instructions to generate the new, pipelined loop,
    2445             :   // and update register names as needed.
    2446             :   for (int Cycle = Schedule.getFirstCycle(),
    2447             :            LastCycle = Schedule.getFinalCycle();
    2448         715 :        Cycle <= LastCycle; ++Cycle) {
    2449             :     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
    2450             :     // This inner loop schedules each instruction in the cycle.
    2451             :     for (SUnit *CI : CycleInstrs) {
    2452             :       if (CI->getInstr()->isPHI())
    2453         125 :         continue;
    2454         125 :       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
    2455             :       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
    2456             :       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
    2457             :       KernelBB->push_back(NewMI);
    2458         347 :       InstrMap[NewMI] = CI->getInstr();
    2459         125 :     }
    2460         472 :   }
    2461         347 : 
    2462             :   // Copy any terminator instructions to the new kernel, and update
    2463        1602 :   // names as needed.
    2464        1255 :   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
    2465         268 :                                    E = BB->instr_end();
    2466         987 :        I != E; ++I) {
    2467         987 :     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
    2468         987 :     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
    2469         987 :     KernelBB->push_back(NewMI);
    2470         987 :     InstrMap[NewMI] = &*I;
    2471             :   }
    2472             : 
    2473             :   KernelBB->transferSuccessors(BB);
    2474             :   KernelBB->replaceSuccessor(BB, KernelBB);
    2475             : 
    2476         125 :   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
    2477         125 :                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
    2478         375 :   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
    2479         250 :                InstrMap, MaxStageCount, MaxStageCount, false);
    2480         250 : 
    2481         250 :   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
    2482         250 : 
    2483             :   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
    2484             :   // Generate the epilog instructions to complete the pipeline.
    2485         125 :   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
    2486         125 :                  PrologBBs);
    2487             : 
    2488         125 :   // We need this step because the register allocation doesn't handle some
    2489             :   // situations well, so we insert copies to help out.
    2490         125 :   splitLifetimes(KernelBB, EpilogBBs, Schedule);
    2491             : 
    2492             :   // Remove dead instructions due to loop induction variables.
    2493             :   removeDeadInstructions(KernelBB, EpilogBBs);
    2494             : 
    2495             :   // Add branches between prolog and epilog blocks.
    2496             :   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
    2497         125 : 
    2498             :   // Remove the original loop since it's no longer referenced.
    2499             :   for (auto &I : *BB)
    2500             :     LIS.RemoveMachineInstrFromMaps(I);
    2501             :   BB->clear();
    2502         125 :   BB->eraseFromParent();
    2503             : 
    2504             :   delete[] VRMap;
    2505         125 : }
    2506             : 
    2507             : /// Generate the pipeline prolog code.
    2508         125 : void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
    2509             :                                        MachineBasicBlock *KernelBB,
    2510             :                                        ValueMapTy *VRMap,
    2511        1630 :                                        MBBVectorTy &PrologBBs) {
    2512        1505 :   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
    2513         125 :   assert(PreheaderBB != nullptr &&
    2514         125 :          "Need to add code to handle loops w/o preheader");
    2515             :   MachineBasicBlock *PredBB = PreheaderBB;
    2516         250 :   InstrMapTy InstrMap;
    2517         125 : 
    2518             :   // Generate a basic block for each stage, not including the last stage,
    2519             :   // which will be generated in the kernel. Each basic block may contain
    2520         125 :   // instructions from multiple stages/iterations.
    2521             :   for (unsigned i = 0; i < LastStage; ++i) {
    2522             :     // Create and insert the prolog basic block prior to the original loop
    2523             :     // basic block.  The original loop is removed later.
    2524         250 :     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    2525             :     PrologBBs.push_back(NewBB);
    2526             :     MF.insert(BB->getIterator(), NewBB);
    2527             :     NewBB->transferSuccessors(PredBB);
    2528             :     PredBB->addSuccessor(NewBB);
    2529             :     PredBB = NewBB;
    2530             : 
    2531             :     // Generate instructions for each appropriate stage. Process instructions
    2532             :     // in original program order.
    2533         295 :     for (int StageNum = i; StageNum >= 0; --StageNum) {
    2534             :       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
    2535             :                                        BBE = BB->getFirstTerminator();
    2536         170 :            BBI != BBE; ++BBI) {
    2537         170 :         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
    2538         170 :           if (BBI->isPHI())
    2539         170 :             continue;
    2540         170 :           MachineInstr *NewMI =
    2541         170 :               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
    2542             :           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
    2543             :                             VRMap);
    2544             :           NewBB->push_back(NewMI);
    2545         394 :           InstrMap[NewMI] = &*BBI;
    2546         224 :         }
    2547         224 :       }
    2548        2659 :     }
    2549        4870 :     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
    2550             :     LLVM_DEBUG({
    2551         259 :       dbgs() << "prolog:\n";
    2552             :       NewBB->dump();
    2553         776 :     });
    2554         776 :   }
    2555             : 
    2556         776 :   PredBB->replaceSuccessor(BB, KernelBB);
    2557         776 : 
    2558             :   // Check if we need to remove the branch from the preheader to the original
    2559             :   // loop, and replace it with a branch to the new loop.
    2560             :   unsigned numBranches = TII->removeBranch(*PreheaderBB);
    2561         170 :   if (numBranches) {
    2562             :     SmallVector<MachineOperand, 0> Cond;
    2563             :     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
    2564             :   }
    2565             : }
    2566             : 
    2567             : /// Generate the pipeline epilog code. The epilog code finishes the iterations
    2568         125 : /// that were started in either the prolog or the kernel.  We create a basic
    2569             : /// block for each stage that needs to complete.
    2570             : void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
    2571             :                                        MachineBasicBlock *KernelBB,
    2572         125 :                                        ValueMapTy *VRMap,
    2573         125 :                                        MBBVectorTy &EpilogBBs,
    2574             :                                        MBBVectorTy &PrologBBs) {
    2575          70 :   // We need to change the branch from the kernel to the first epilog block, so
    2576             :   // this call to analyze branch uses the kernel rather than the original BB.
    2577         125 :   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
    2578             :   SmallVector<MachineOperand, 4> Cond;
    2579             :   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
    2580             :   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
    2581             :   if (checkBranch)
    2582         125 :     return;
    2583             : 
    2584             :   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
    2585             :   if (*LoopExitI == KernelBB)
    2586             :     ++LoopExitI;
    2587             :   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
    2588             :   MachineBasicBlock *LoopExitBB = *LoopExitI;
    2589         125 : 
    2590             :   MachineBasicBlock *PredBB = KernelBB;
    2591         125 :   MachineBasicBlock *EpilogStart = LoopExitBB;
    2592             :   InstrMapTy InstrMap;
    2593         125 : 
    2594             :   // Generate a basic block for each stage, not including the last stage,
    2595             :   // which was generated for the kernel.  Each basic block may contain
    2596             :   // instructions from multiple stages/iterations.
    2597         125 :   int EpilogStage = LastStage + 1;
    2598             :   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    2599             :     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    2600         125 :     EpilogBBs.push_back(NewBB);
    2601             :     MF.insert(BB->getIterator(), NewBB);
    2602             : 
    2603             :     PredBB->replaceSuccessor(LoopExitBB, NewBB);
    2604             :     NewBB->addSuccessor(LoopExitBB);
    2605             : 
    2606             :     if (EpilogStart == LoopExitBB)
    2607             :       EpilogStart = NewBB;
    2608             : 
    2609         125 :     // Add instructions to the epilog depending on the current block.
    2610         295 :     // Process instructions in original program order.
    2611         170 :     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
    2612         170 :       for (auto &BBI : *BB) {
    2613         170 :         if (BBI.isPHI())
    2614             :           continue;
    2615         170 :         MachineInstr *In = &BBI;
    2616         170 :         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
    2617             :           // Instructions with memoperands in the epilog are updated with
    2618         170 :           // conservative values.
    2619         125 :           MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
    2620             :           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
    2621             :           NewBB->push_back(NewMI);
    2622             :           InstrMap[NewMI] = In;
    2623         394 :         }
    2624        3107 :       }
    2625             :     }
    2626             :     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
    2627             :                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
    2628        2353 :     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
    2629             :                  InstrMap, LastStage, EpilogStage, i == 1);
    2630             :     PredBB = NewBB;
    2631         643 : 
    2632         643 :     LLVM_DEBUG({
    2633         643 :       dbgs() << "epilog:\n";
    2634         643 :       NewBB->dump();
    2635             :     });
    2636             :   }
    2637             : 
    2638         340 :   // Fix any Phi nodes in the loop exit block.
    2639             :   for (MachineInstr &MI : *LoopExitBB) {
    2640         170 :     if (!MI.isPHI())
    2641             :       break;
    2642         170 :     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
    2643             :       MachineOperand &MO = MI.getOperand(i);
    2644             :       if (MO.getMBB() == BB)
    2645             :         MO.setMBB(PredBB);
    2646             :     }
    2647             :   }
    2648             : 
    2649             :   // Create a branch to the new epilog from the kernel.
    2650             :   // Remove the original branch and add a new branch to the epilog.
    2651         139 :   TII->removeBranch(*KernelBB);
    2652             :   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
    2653             :   // Add a branch to the loop exit.
    2654          45 :   if (EpilogBBs.size() > 0) {
    2655          31 :     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    2656          31 :     SmallVector<MachineOperand, 4> Cond1;
    2657             :     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
    2658             :   }
    2659             : }
    2660             : 
    2661             : /// Replace all uses of FromReg that appear outside the specified
    2662             : /// basic block with ToReg.
    2663         125 : static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
    2664         250 :                                     MachineBasicBlock *MBB,
    2665             :                                     MachineRegisterInfo &MRI,
    2666         250 :                                     LiveIntervals &LIS) {
    2667         125 :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
    2668             :                                          E = MRI.use_end();
    2669         250 :        I != E;) {
    2670             :     MachineOperand &O = *I;
    2671             :     ++I;
    2672             :     if (O.getParent()->getParent() != MBB)
    2673             :       O.setReg(ToReg);
    2674             :   }
    2675         960 :   if (!LIS.hasInterval(ToReg))
    2676             :     LIS.createEmptyInterval(ToReg);
    2677             : }
    2678             : 
    2679         960 : /// Return true if the register has a use that occurs outside the
    2680             : /// specified loop.
    2681        2324 : static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
    2682             :                             MachineRegisterInfo &MRI) {
    2683             :   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
    2684        1364 :                                          E = MRI.use_end();
    2685          70 :        I != E; ++I)
    2686             :     if (I->getParent()->getParent() != BB)
    2687             :       return true;
    2688         960 :   return false;
    2689         960 : }
    2690             : 
    2691             : /// Generate Phis for the specific block in the generated pipelined code.
    2692             : /// This function looks at the Phis from the original code to guide the
    2693         370 : /// creation of new Phis.
    2694             : void SwingSchedulerDAG::generateExistingPhis(
    2695         370 :     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    2696             :     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
    2697         780 :     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    2698         418 :     bool IsLast) {
    2699             :   // Compute the stage number for the initial value of the Phi, which
    2700             :   // comes from the prolog. The prolog to use depends on to which kernel/
    2701             :   // epilog that we're adding the Phi.
    2702             :   unsigned PrologStage = 0;
    2703             :   unsigned PrevStage = 0;
    2704             :   bool InKernel = (LastStageNum == CurStageNum);
    2705             :   if (InKernel) {
    2706         295 :     PrologStage = LastStageNum - 1;
    2707             :     PrevStage = CurStageNum;
    2708             :   } else {
    2709             :     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
    2710             :     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
    2711             :   }
    2712             : 
    2713             :   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
    2714             :                                    BBE = BB->getFirstNonPHI();
    2715             :        BBI != BBE; ++BBI) {
    2716         295 :     unsigned Def = BBI->getOperand(0).getReg();
    2717         295 : 
    2718         125 :     unsigned InitVal = 0;
    2719             :     unsigned LoopVal = 0;
    2720             :     getPhiRegs(*BBI, BB, InitVal, LoopVal);
    2721         170 : 
    2722         170 :     unsigned PhiOp1 = 0;
    2723             :     // The Phi value from the loop body typically is defined in the loop, but
    2724             :     // not always. So, we need to check if the value is defined in the loop.
    2725         295 :     unsigned PhiOp2 = LoopVal;
    2726         295 :     if (VRMap[LastStageNum].count(LoopVal))
    2727         953 :       PhiOp2 = VRMap[LastStageNum][LoopVal];
    2728         658 : 
    2729             :     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
    2730             :     int LoopValStage =
    2731         658 :         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
    2732         658 :     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
    2733             :     if (NumStages == 0) {
    2734         658 :       // We don't need to generate a Phi anymore, but we need to rename any uses
    2735             :       // of the Phi value.
    2736             :       unsigned NewReg = VRMap[PrevStage][LoopVal];
    2737         658 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
    2738         658 :                             Def, InitVal, NewReg);
    2739         650 :       if (VRMap[CurStageNum].count(LoopVal))
    2740             :         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
    2741         658 :     }
    2742             :     // Adjust the number of Phis needed depending on the number of prologs left,
    2743         658 :     // and the distance from where the Phi is first scheduled. The number of
    2744         658 :     // Phis cannot exceed the number of prolog stages. Each stage can
    2745         658 :     // potentially define two values.
    2746             :     unsigned MaxPhis = PrologStage + 2;
    2747             :     if (!InKernel && (int)PrologStage <= LoopValStage)
    2748           5 :       MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
    2749           5 :     unsigned NumPhis = std::min(NumStages, MaxPhis);
    2750             : 
    2751           5 :     unsigned NewReg = 0;
    2752           5 :     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
    2753             :     // In the epilog, we may need to look back one stage to get the correct
    2754             :     // Phi name because the epilog and prolog blocks execute the same stage.
    2755             :     // The correct name is from the previous block only when the Phi has
    2756             :     // been completely scheduled prior to the epilog, and Phi value is not
    2757             :     // needed in multiple stages.
    2758         658 :     int StageDiff = 0;
    2759         658 :     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
    2760         366 :         NumPhis == 1)
    2761         658 :       StageDiff = 1;
    2762             :     // Adjust the computations below when the phi and the loop definition
    2763             :     // are scheduled in different stages.
    2764         658 :     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
    2765             :       StageDiff = StageScheduled - LoopValStage;
    2766             :     for (unsigned np = 0; np < NumPhis; ++np) {
    2767             :       // If the Phi hasn't been scheduled, then use the initial Phi operand
    2768             :       // value. Otherwise, use the scheduled version of the instruction. This
    2769             :       // is a little complicated when a Phi references another Phi.
    2770             :       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
    2771         658 :         PhiOp1 = InitVal;
    2772         368 :       // Check if the Phi has already been scheduled in a prolog stage.
    2773             :       else if (PrologStage >= AccessStage + StageDiff + np &&
    2774             :                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
    2775             :         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
    2776         658 :       // Check if the Phi has already been scheduled, but the loop intruction
    2777           1 :       // is either another Phi, or doesn't occur in the loop.
    2778        1376 :       else if (PrologStage >= AccessStage + StageDiff + np) {
    2779             :         // If the Phi references another Phi, we need to examine the other
    2780             :         // Phi to get the correct value.
    2781             :         PhiOp1 = LoopVal;
    2782         718 :         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
    2783         228 :         int Indirects = 1;
    2784             :         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
    2785         490 :           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
    2786         307 :           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
    2787         279 :             PhiOp1 = getInitPhiReg(*InstOp1, BB);
    2788             :           else
    2789             :             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
    2790         211 :           InstOp1 = MRI.getVRegDef(PhiOp1);
    2791             :           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
    2792             :           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
    2793          28 :           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
    2794          28 :               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
    2795             :             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
    2796          75 :             break;
    2797          28 :           }
    2798          28 :           ++Indirects;
    2799          36 :         }
    2800             :       } else
    2801          20 :         PhiOp1 = InitVal;
    2802          28 :       // If this references a generated Phi in the kernel, get the Phi operand
    2803          18 :       // from the incoming block.
    2804          10 :       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
    2805          28 :         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
    2806          10 :           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
    2807          10 : 
    2808          10 :       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
    2809             :       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
    2810          18 :       // In the epilog, a map lookup is needed to get the value from the kernel,
    2811             :       // or previous epilog block. How is does this depends on if the
    2812             :       // instruction is scheduled in the previous block.
    2813         183 :       if (!InKernel) {
    2814             :         int StageDiffAdj = 0;
    2815             :         if (LoopValStage != -1 && StageScheduled > LoopValStage)
    2816         718 :           StageDiffAdj = StageScheduled - LoopValStage;
    2817          68 :         // Use the loop value defined in the kernel, unless the kernel
    2818           8 :         // contains the last definition of the Phi.
    2819             :         if (np == 0 && PrevStage == LastStageNum &&
    2820         718 :             (StageScheduled != 0 || LoopValStage != 0) &&
    2821         718 :             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
    2822             :           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
    2823             :         // Use the value defined by the Phi. We add one because we switch
    2824             :         // from looking at the loop value to the Phi definition.
    2825         718 :         else if (np > 0 && PrevStage == LastStageNum &&
    2826             :                  VRMap[PrevStage - np + 1].count(Def))
    2827         428 :           PhiOp2 = VRMap[PrevStage - np + 1][Def];
    2828           3 :         // Use the loop value defined in the kernel.
    2829             :         else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
    2830             :                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
    2831         696 :           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
    2832         428 :         // Use the value defined by the Phi, unless we're generating the first
    2833         104 :         // epilog and the Phi refers to a Phi in a different stage.
    2834         104 :         else if (VRMap[PrevStage - np].count(Def) &&
    2835             :                  (!LoopDefIsPhi || PrevStage != LastStageNum))
    2836             :           PhiOp2 = VRMap[PrevStage - np][Def];
    2837         324 :       }
    2838          27 : 
    2839          27 :       // Check if we can reuse an existing Phi. This occurs when a Phi
    2840             :       // references another Phi, and the other Phi is scheduled in an
    2841         297 :       // earlier stage. We can try to reuse an existing Phi up until the last
    2842          38 :       // stage of the current Phi.
    2843          38 :       if (LoopDefIsPhi) {
    2844             :         if (static_cast<int>(PrologStage - np) >= StageScheduled) {
    2845             :           int LVNumStages = Schedule.getStagesForPhi(LoopVal);
    2846         259 :           int StageDiff = (StageScheduled - LoopValStage);
    2847         259 :           LVNumStages -= StageDiff;
    2848         244 :           // Make sure the loop value Phi has been processed already.
    2849             :           if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
    2850             :             NewReg = PhiOp2;
    2851             :             unsigned ReuseStage = CurStageNum;
    2852             :             if (Schedule.isLoopCarried(this, *PhiInst))
    2853             :               ReuseStage -= LVNumStages;
    2854             :             // Check if the Phi to reuse has been generated yet. If not, then
    2855         718 :             // there is nothing to reuse.
    2856          51 :             if (VRMap[ReuseStage - np].count(LoopVal)) {
    2857          42 :               NewReg = VRMap[ReuseStage - np][LoopVal];
    2858          42 : 
    2859          42 :               rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    2860             :                                     &*BBI, Def, NewReg);
    2861          42 :               // Update the map with the new Phi name.
    2862             :               VRMap[CurStageNum - np][Def] = NewReg;
    2863             :               PhiOp2 = NewReg;
    2864           0 :               if (VRMap[LastStageNum - np - 1].count(LoopVal))
    2865           0 :                 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
    2866             : 
    2867             :               if (IsLast && np == NumPhis - 1)
    2868           0 :                 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    2869           0 :               continue;
    2870             :             }
    2871           0 :           }
    2872             :         }
    2873             :         if (InKernel && StageDiff > 0 &&
    2874           0 :             VRMap[CurStageNum - StageDiff - np].count(LoopVal))
    2875             :           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
    2876           0 :       }
    2877           0 : 
    2878             :       const TargetRegisterClass *RC = MRI.getRegClass(Def);
    2879           0 :       NewReg = MRI.createVirtualRegister(RC);
    2880           0 : 
    2881           0 :       MachineInstrBuilder NewPhi =
    2882             :           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
    2883             :                   TII->get(TargetOpcode::PHI), NewReg);
    2884             :       NewPhi.addReg(PhiOp1).addMBB(BB1);
    2885          51 :       NewPhi.addReg(PhiOp2).addMBB(BB2);
    2886           1 :       if (np == 0)
    2887           1 :         InstrMap[NewPhi] = &*BBI;
    2888             : 
    2889             :       // We define the Phis after creating the new pipelined code, so
    2890         718 :       // we need to rename the Phi values in scheduled instructions.
    2891         718 : 
    2892             :       unsigned PrevReg = 0;
    2893             :       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
    2894        1436 :         PrevReg = VRMap[PrevStage - np][LoopVal];
    2895         718 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
    2896         718 :                             Def, NewReg, PrevReg);
    2897         718 :       // If the Phi has been scheduled, use the new name for rewriting.
    2898         718 :       if (VRMap[CurStageNum - np].count(Def)) {
    2899         653 :         unsigned R = VRMap[CurStageNum - np][Def];
    2900             :         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
    2901             :                               R, NewReg);
    2902             :       }
    2903             : 
    2904             :       // Check if we need to rename any uses that occurs after the loop. The
    2905         718 :       // register to replace depends on whether the Phi is scheduled in the
    2906         280 :       // epilog.
    2907         718 :       if (IsLast && np == NumPhis - 1)
    2908             :         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    2909             : 
    2910         718 :       // In the kernel, a dependent Phi uses the value from this Phi.
    2911          38 :       if (InKernel)
    2912          38 :         PhiOp2 = NewReg;
    2913             : 
    2914             :       // Update the map with the new Phi name.
    2915             :       VRMap[CurStageNum - np][Def] = NewReg;
    2916             :     }
    2917             : 
    2918             :     while (NumPhis++ < NumStages) {
    2919         718 :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
    2920         268 :                             &*BBI, Def, NewReg, 0);
    2921             :     }
    2922             : 
    2923         718 :     // Check if we need to rename a Phi that has been eliminated due to
    2924             :     // scheduling.
    2925             :     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
    2926             :       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
    2927         718 :   }
    2928             : }
    2929             : 
    2930         668 : /// Generate Phis for the specified block in the generated pipelined code.
    2931          10 : /// These are new Phis needed because the definition is scheduled after the
    2932             : /// use in the pipelined sequence.
    2933             : void SwingSchedulerDAG::generatePhis(
    2934             :     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    2935             :     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
    2936             :     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    2937         658 :     bool IsLast) {
    2938           0 :   // Compute the stage number that contains the initial Phi value, and
    2939             :   // the Phi from the previous stage.
    2940         295 :   unsigned PrologStage = 0;
    2941             :   unsigned PrevStage = 0;
    2942             :   unsigned StageDiff = CurStageNum - LastStageNum;
    2943             :   bool InKernel = (StageDiff == 0);
    2944             :   if (InKernel) {
    2945         295 :     PrologStage = LastStageNum - 1;
    2946             :     PrevStage = CurStageNum;
    2947             :   } else {
    2948             :     PrologStage = LastStageNum - StageDiff;
    2949             :     PrevStage = LastStageNum + StageDiff - 1;
    2950             :   }
    2951             : 
    2952             :   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
    2953             :                                    BBE = BB->instr_end();
    2954         295 :        BBI != BBE; ++BBI) {
    2955             :     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
    2956         295 :       MachineOperand &MO = BBI->getOperand(i);
    2957         125 :       if (!MO.isReg() || !MO.isDef() ||
    2958             :           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    2959             :         continue;
    2960         170 : 
    2961         170 :       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
    2962             :       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
    2963             :       unsigned Def = MO.getReg();
    2964         295 :       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
    2965         295 :       // An instruction scheduled in stage 0 and is used after the loop
    2966        3291 :       // requires a phi in the epilog for the last definition from either
    2967       13248 :       // the kernel or prolog.
    2968       10252 :       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
    2969       10252 :           hasUseAfterLoop(Def, BB, MRI))
    2970        3376 :         NumPhis = 1;
    2971        8401 :       if (!InKernel && (unsigned)StageScheduled > PrologStage)
    2972             :         continue;
    2973        2420 : 
    2974             :       unsigned PhiOp2 = VRMap[PrevStage][Def];
    2975        2420 :       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
    2976        2420 :         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
    2977             :           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
    2978             :       // The number of Phis can't exceed the number of prolog stages. The
    2979             :       // prolog stage number is zero based.
    2980        2790 :       if (NumPhis > PrologStage + 1 - StageScheduled)
    2981         370 :         NumPhis = PrologStage + 1 - StageScheduled;
    2982             :       for (unsigned np = 0; np < NumPhis; ++np) {
    2983        2420 :         unsigned PhiOp1 = VRMap[PrologStage][Def];
    2984             :         if (np <= PrologStage)
    2985             :           PhiOp1 = VRMap[PrologStage - np][Def];
    2986        1851 :         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
    2987        1851 :           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
    2988          88 :             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
    2989             :           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
    2990             :             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
    2991             :         }
    2992        1851 :         if (!InKernel)
    2993             :           PhiOp2 = VRMap[PrevStage - np][Def];
    2994        2524 : 
    2995         673 :         const TargetRegisterClass *RC = MRI.getRegClass(Def);
    2996         673 :         unsigned NewReg = MRI.createVirtualRegister(RC);
    2997         673 : 
    2998         673 :         MachineInstrBuilder NewPhi =
    2999         306 :             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
    3000             :                     TII->get(TargetOpcode::PHI), NewReg);
    3001         306 :         NewPhi.addReg(PhiOp1).addMBB(BB1);
    3002             :         NewPhi.addReg(PhiOp2).addMBB(BB2);
    3003             :         if (np == 0)
    3004         673 :           InstrMap[NewPhi] = &*BBI;
    3005         385 : 
    3006             :         // Rewrite uses and update the map. The actions depend upon whether
    3007         673 :         // we generating code for the kernel or epilog blocks.
    3008         673 :         if (InKernel) {
    3009             :           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    3010             :                                 &*BBI, PhiOp1, NewReg);
    3011        1346 :           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    3012         673 :                                 &*BBI, PhiOp2, NewReg);
    3013         673 : 
    3014         673 :           PhiOp2 = NewReg;
    3015         673 :           VRMap[PrevStage - np - 1][Def] = NewReg;
    3016         636 :         } else {
    3017             :           VRMap[CurStageNum - np][Def] = NewReg;
    3018             :           if (np == NumPhis - 1)
    3019             :             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
    3020         673 :                                   &*BBI, Def, NewReg);
    3021         288 :         }
    3022             :         if (IsLast && np == NumPhis - 1)
    3023         288 :           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
    3024             :       }
    3025             :     }
    3026             :   }
    3027         288 : }
    3028             : 
    3029         385 : /// Remove instructions that generate values with no uses.
    3030         385 : /// Typically, these are induction variable operations that generate values
    3031         365 : /// used in the loop itself.  A dead instruction has a definition with
    3032             : /// no uses, or uses that occur in the original loop only.
    3033             : void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
    3034         673 :                                                MBBVectorTy &EpilogBBs) {
    3035         221 :   // For each epilog block, check that the value defined by each instruction
    3036             :   // is used.  If not, delete it.
    3037             :   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
    3038             :                                      MBE = EpilogBBs.rend();
    3039         295 :        MBB != MBE; ++MBB)
    3040             :     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
    3041             :                                                    ME = (*MBB)->instr_rend();
    3042             :          MI != ME;) {
    3043             :       // From DeadMachineInstructionElem. Don't delete inline assembly.
    3044             :       if (MI->isInlineAsm()) {
    3045         125 :         ++MI;
    3046             :         continue;
    3047             :       }
    3048             :       bool SawStore = false;
    3049             :       // Check if it's safe to remove the instruction due to side effects.
    3050             :       // We can, and want to, remove Phis here.
    3051         295 :       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
    3052         170 :         ++MI;
    3053             :         continue;
    3054        1751 :       }
    3055             :       bool used = true;
    3056        1581 :       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
    3057             :                                       MOE = MI->operands_end();
    3058         523 :            MOI != MOE; ++MOI) {
    3059             :         if (!MOI->isReg() || !MOI->isDef())
    3060        1581 :           continue;
    3061             :         unsigned reg = MOI->getReg();
    3062             :         // Assume physical registers are used, unless they are marked dead.
    3063        1581 :         if (TargetRegisterInfo::isPhysicalRegister(reg)) {
    3064             :           used = !MOI->isDead();
    3065         268 :           if (used)
    3066             :             break;
    3067             :           continue;
    3068        1252 :         }
    3069        1313 :         unsigned realUses = 0;
    3070        2565 :         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
    3071        2310 :                                                EI = MRI.use_end();
    3072             :              UI != EI; ++UI) {
    3073        1313 :           // Check if there are any uses that occur only in the original
    3074             :           // loop.  If so, that's not a real use.
    3075        1313 :           if (UI->getParent()->getParent() != BB) {
    3076           0 :             realUses++;
    3077           0 :             used = true;
    3078             :             break;
    3079             :           }
    3080             :         }
    3081             :         if (realUses > 0)
    3082        1313 :           break;
    3083             :         used = false;
    3084        1313 :       }
    3085             :       if (!used) {
    3086             :         LIS.RemoveMachineInstrFromMaps(*MI);
    3087        1058 :         MI++->eraseFromParent();
    3088             :         continue;
    3089             :       }
    3090             :       ++MI;
    3091             :     }
    3092             :   // In the kernel block, check if we can remove a Phi that generates a value
    3093        1313 :   // used in an instruction removed in the epilog block.
    3094             :   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
    3095             :                                    BBE = KernelBB->getFirstNonPHI();
    3096             :        BBI != BBE;) {
    3097        1313 :     MachineInstr *MI = &*BBI;
    3098         255 :     ++BBI;
    3099         255 :     unsigned reg = MI->getOperand(0).getReg();
    3100         255 :     if (MRI.use_begin(reg) == MRI.use_end()) {
    3101             :       LIS.RemoveMachineInstrFromMaps(*MI);
    3102             :       MI->eraseFromParent();
    3103             :     }
    3104             :   }
    3105             : }
    3106             : 
    3107         125 : /// For loop carried definitions, we split the lifetime of a virtual register
    3108         703 : /// that has uses past the definition in the next iteration. A copy with a new
    3109             : /// virtual register is inserted before the definition, which helps with
    3110             : /// generating a better register assignment.
    3111         578 : ///
    3112         578 : ///   v1 = phi(a, v2)     v1 = phi(a, v2)
    3113           0 : ///   v2 = phi(b, v3)     v2 = phi(b, v3)
    3114           0 : ///   v3 = ..             v4 = copy v1
    3115             : ///   .. = V1             v3 = ..
    3116             : ///                       .. = v4
    3117         125 : void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
    3118             :                                        MBBVectorTy &EpilogBBs,
    3119             :                                        SMSchedule &Schedule) {
    3120             :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3121             :   for (auto &PHI : KernelBB->phis()) {
    3122             :     unsigned Def = PHI.getOperand(0).getReg();
    3123             :     // Check for any Phi definition that used as an operand of another Phi
    3124             :     // in the same block.
    3125             :     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
    3126             :                                                  E = MRI.use_instr_end();
    3127             :          I != E; ++I) {
    3128             :       if (I->isPHI() && I->getParent() == KernelBB) {
    3129           0 :         // Get the loop carried definition.
    3130             :         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
    3131             :         if (!LCDef)
    3132           0 :           continue;
    3133           0 :         MachineInstr *MI = MRI.getVRegDef(LCDef);
    3134           0 :         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
    3135             :           continue;
    3136             :         // Search through the rest of the block looking for uses of the Phi
    3137           0 :         // definition. If one occurs, then split the lifetime.
    3138             :         unsigned SplitReg = 0;
    3139           0 :         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
    3140           0 :                                     KernelBB->instr_end()))
    3141             :           if (BBJ.readsRegister(Def)) {
    3142             :             // We split the lifetime when we find the first use.
    3143           0 :             if (SplitReg == 0) {
    3144           0 :               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
    3145           0 :               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
    3146           0 :                       TII->get(TargetOpcode::COPY), SplitReg)
    3147           0 :                   .addReg(Def);
    3148             :             }
    3149             :             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
    3150             :           }
    3151             :         if (!SplitReg)
    3152           0 :           continue;
    3153           0 :         // Search through each of the epilog blocks for any uses to be renamed.
    3154             :         for (auto &Epilog : EpilogBBs)
    3155           0 :           for (auto &I : *Epilog)
    3156           0 :             if (I.readsRegister(Def))
    3157           0 :               I.substituteRegister(Def, SplitReg, 0, *TRI);
    3158           0 :         break;
    3159           0 :       }
    3160             :     }
    3161           0 :   }
    3162             : }
    3163           0 : 
    3164           0 : /// Remove the incoming block from the Phis in a basic block.
    3165             : static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
    3166           0 :   for (MachineInstr &MI : *BB) {
    3167           0 :     if (!MI.isPHI())
    3168           0 :       break;
    3169           0 :     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
    3170             :       if (MI.getOperand(i + 1).getMBB() == Incoming) {
    3171             :         MI.RemoveOperand(i + 1);
    3172             :         MI.RemoveOperand(i);
    3173             :         break;
    3174           0 :       }
    3175             :   }
    3176             : }
    3177          41 : 
    3178         176 : /// Create branches from each prolog basic block to the appropriate epilog
    3179             : /// block.  These edges are needed if the loop ends before reaching the
    3180             : /// kernel.
    3181         161 : void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
    3182         322 :                                     MachineBasicBlock *KernelBB,
    3183         135 :                                     MBBVectorTy &EpilogBBs,
    3184         135 :                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
    3185         135 :   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
    3186             :   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
    3187             :   MachineInstr *Cmp = Pass.LI.LoopCompare;
    3188          41 :   MachineBasicBlock *LastPro = KernelBB;
    3189             :   MachineBasicBlock *LastEpi = KernelBB;
    3190             : 
    3191             :   // Start from the blocks connected to the kernel and work "out"
    3192             :   // to the first prolog and the last epilog blocks.
    3193         125 :   SmallVector<MachineInstr *, 4> PrevInsts;
    3194             :   unsigned MaxIter = PrologBBs.size() - 1;
    3195             :   unsigned LC = UINT_MAX;
    3196             :   unsigned LCMin = UINT_MAX;
    3197             :   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    3198         125 :     // Add branches to the prolog that go to the corresponding
    3199         125 :     // epilog, and the fall-thru prolog/kernel block.
    3200             :     MachineBasicBlock *Prolog = PrologBBs[j];
    3201             :     MachineBasicBlock *Epilog = EpilogBBs[i];
    3202             :     // We've executed one iteration, so decrement the loop count and check for
    3203             :     // the loop end.
    3204             :     SmallVector<MachineOperand, 4> Cond;
    3205             :     // Check if the LOOP0 has already been removed. If so, then there is no need
    3206         125 :     // to reduce the trip count.
    3207             :     if (LC != 0)
    3208             :       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
    3209         295 :                                 MaxIter);
    3210             : 
    3211             :     // Record the value of the first trip count, which is used to determine if
    3212         170 :     // branches and blocks can be removed for constant trip counts.
    3213         340 :     if (LCMin == UINT_MAX)
    3214             :       LCMin = LC;
    3215             : 
    3216             :     unsigned numAdded = 0;
    3217             :     if (TargetRegisterInfo::isVirtualRegister(LC)) {
    3218             :       Prolog->addSuccessor(Epilog);
    3219         170 :       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    3220         338 :     } else if (j >= LCMin) {
    3221         169 :       Prolog->addSuccessor(Epilog);
    3222             :       Prolog->removeSuccessor(LastPro);
    3223             :       LastEpi->removeSuccessor(Epilog);
    3224             :       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
    3225         170 :       removePhis(Epilog, LastEpi);
    3226             :       // Remove the blocks that are no longer referenced.
    3227             :       if (LastPro != LastEpi) {
    3228             :         LastEpi->clear();
    3229         170 :         LastEpi->eraseFromParent();
    3230         129 :       }
    3231         258 :       LastPro->clear();
    3232          41 :       LastPro->eraseFromParent();
    3233           5 :     } else {
    3234           5 :       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
    3235           5 :       removePhis(Epilog, Prolog);
    3236          10 :     }
    3237           5 :     LastPro = Prolog;
    3238             :     LastEpi = Epilog;
    3239           5 :     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
    3240             :                                                    E = Prolog->instr_rend();
    3241           1 :          I != E && numAdded > 0; ++I, --numAdded)
    3242             :       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
    3243             :   }
    3244           5 : }
    3245             : 
    3246          72 : /// Return true if we can compute the amount the instruction changes
    3247          36 : /// during each iteration. Set Delta to the amount of the change.
    3248             : bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
    3249             :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3250             :   unsigned BaseReg;
    3251         299 :   int64_t Offset;
    3252             :   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
    3253         469 :     return false;
    3254         299 : 
    3255             :   MachineRegisterInfo &MRI = MF.getRegInfo();
    3256         125 :   // Check if there is a Phi. If so, get the definition in the loop.
    3257             :   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
    3258             :   if (BaseDef && BaseDef->isPHI()) {
    3259             :     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    3260           0 :     BaseDef = MRI.getVRegDef(BaseReg);
    3261           0 :   }
    3262             :   if (!BaseDef)
    3263             :     return false;
    3264           0 : 
    3265           0 :   int D = 0;
    3266             :   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    3267           0 :     return false;
    3268             : 
    3269           0 :   Delta = D;
    3270           0 :   return true;
    3271           0 : }
    3272           0 : 
    3273             : /// Update the memory operand with a new offset when the pipeliner
    3274           0 : /// generates a new copy of the instruction that refers to a
    3275           0 : /// different memory location.
    3276             : void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
    3277           0 :                                           MachineInstr &OldMI, unsigned Num) {
    3278           0 :   if (Num == 0)
    3279           0 :     return;
    3280             :   // If the instruction has memory operands, then adjust the offset
    3281           0 :   // when the instruction appears in different stages.
    3282           0 :   if (NewMI.memoperands_empty())
    3283             :     return;
    3284             :   SmallVector<MachineMemOperand *, 2> NewMMOs;
    3285             :   for (MachineMemOperand *MMO : NewMI.memoperands()) {
    3286             :     if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
    3287             :         (!MMO->getValue())) {
    3288        2406 :       NewMMOs.push_back(MMO);
    3289             :       continue;
    3290        2406 :     }
    3291        1956 :     unsigned Delta;
    3292             :     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
    3293             :       int64_t AdjOffset = Delta * Num;
    3294        1419 :       NewMMOs.push_back(
    3295             :           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
    3296             :     } else {
    3297         930 :       NewMMOs.push_back(
    3298        1440 :           MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
    3299             :     }
    3300           0 :   }
    3301           0 :   NewMI.setMemRefs(MF, NewMMOs);
    3302             : }
    3303             : 
    3304         480 : /// Clone the instruction for the new pipelined loop and update the
    3305         192 : /// memory operands, if needed.
    3306         192 : MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
    3307         192 :                                             unsigned CurStageNum,
    3308             :                                             unsigned InstStageNum) {
    3309         288 :   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
    3310         288 :   // Check for tied operands in inline asm instructions. This should be handled
    3311             :   // elsewhere, but I'm not sure of the best solution.
    3312             :   if (OldMI->isInlineAsm())
    3313         450 :     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
    3314             :       const auto &MO = OldMI->getOperand(i);
    3315             :       if (MO.isReg() && MO.isUse())
    3316             :         break;
    3317             :       unsigned UseIdx;
    3318        1630 :       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
    3319             :         NewMI->tieOperands(i, UseIdx);
    3320             :     }
    3321        1630 :   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
    3322             :   return NewMI;
    3323             : }
    3324        1630 : 
    3325           0 : /// Clone the instruction for the new pipelined loop. If needed, this
    3326           0 : /// function updates the instruction using the values saved in the
    3327           0 : /// InstrChanges structure.
    3328             : MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
    3329             :                                                      unsigned CurStageNum,
    3330           0 :                                                      unsigned InstStageNum,
    3331           0 :                                                      SMSchedule &Schedule) {
    3332             :   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
    3333        1630 :   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    3334        1630 :       InstrChanges.find(getSUnit(OldMI));
    3335             :   if (It != InstrChanges.end()) {
    3336             :     std::pair<unsigned, int64_t> RegAndOffset = It->second;
    3337             :     unsigned BasePos, OffsetPos;
    3338             :     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
    3339             :       return nullptr;
    3340         776 :     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
    3341             :     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
    3342             :     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
    3343             :       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
    3344         776 :     NewMI->getOperand(OffsetPos).setImm(NewOffset);
    3345             :   }
    3346        1552 :   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
    3347         776 :   return NewMI;
    3348          26 : }
    3349             : 
    3350          26 : /// Update the machine instruction with new virtual registers.  This
    3351           0 : /// function may change the defintions and/or uses.
    3352          26 : void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
    3353          26 :                                           unsigned CurStageNum,
    3354          26 :                                           unsigned InstrStageNum,
    3355          26 :                                           SMSchedule &Schedule,
    3356          26 :                                           ValueMapTy *VRMap) {
    3357             :   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
    3358         776 :     MachineOperand &MO = NewMI->getOperand(i);
    3359         776 :     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    3360             :       continue;
    3361             :     unsigned reg = MO.getReg();
    3362             :     if (MO.isDef()) {
    3363             :       // Create a new virtual register for the definition.
    3364        2955 :       const TargetRegisterClass *RC = MRI.getRegClass(reg);
    3365             :       unsigned NewReg = MRI.createVirtualRegister(RC);
    3366             :       MO.setReg(NewReg);
    3367             :       VRMap[CurStageNum][reg] = NewReg;
    3368             :       if (LastDef)
    3369       12744 :         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
    3370        9789 :     } else if (MO.isUse()) {
    3371        9789 :       MachineInstr *Def = MRI.getVRegDef(reg);
    3372        3091 :       // Compute the stage that contains the last definition for instruction.
    3373        6698 :       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
    3374        6698 :       unsigned StageNum = CurStageNum;
    3375             :       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
    3376        2420 :         // Compute the difference in stages between the defintion and the use.
    3377        2420 :         unsigned StageDiff = (InstrStageNum - DefStageNum);
    3378        2420 :         // Make an adjustment to get the last definition.
    3379        2420 :         StageNum -= StageDiff;
    3380        2420 :       }
    3381         471 :       if (VRMap[StageNum].count(reg))
    3382             :         MO.setReg(VRMap[StageNum][reg]);
    3383        4278 :     }
    3384             :   }
    3385        4278 : }
    3386             : 
    3387        4278 : /// Return the instruction in the loop that defines the register.
    3388             : /// If the definition is a Phi, then follow the Phi operand to
    3389         466 : /// the instruction in the loop.
    3390             : MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
    3391         466 :   SmallPtrSet<MachineInstr *, 8> Visited;
    3392             :   MachineInstr *Def = MRI.getVRegDef(Reg);
    3393        4278 :   while (Def->isPHI()) {
    3394        1938 :     if (!Visited.insert(Def).second)
    3395             :       break;
    3396             :     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
    3397        2955 :       if (Def->getOperand(i + 1).getMBB() == BB) {
    3398             :         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
    3399             :         break;
    3400             :       }
    3401             :   }
    3402           0 :   return Def;
    3403             : }
    3404           0 : 
    3405             : /// Return the new name for the value from the previous stage.
    3406           0 : unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
    3407             :                                           unsigned LoopVal, unsigned LoopStage,
    3408           0 :                                           ValueMapTy *VRMap,
    3409           0 :                                           MachineBasicBlock *BB) {
    3410           0 :   unsigned PrevVal = 0;
    3411           0 :   if (StageNum > PhiStage) {
    3412             :     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
    3413             :     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
    3414           0 :       // The name is defined in the previous stage.
    3415             :       PrevVal = VRMap[StageNum - 1][LoopVal];
    3416             :     else if (VRMap[StageNum].count(LoopVal))
    3417             :       // The previous name is defined in the current stage when the instruction
    3418         412 :       // order is swapped.
    3419             :       PrevVal = VRMap[StageNum][LoopVal];
    3420             :     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
    3421             :       // The loop value hasn't yet been scheduled.
    3422             :       PrevVal = LoopVal;
    3423         412 :     else if (StageNum == PhiStage + 1)
    3424          69 :       // The loop value is another phi, which has not been scheduled.
    3425          69 :       PrevVal = getInitPhiReg(*LoopInst, BB);
    3426             :     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
    3427          52 :       // The loop value is another phi, which has been scheduled.
    3428          17 :       PrevVal =
    3429             :           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
    3430             :                         LoopStage, VRMap, BB);
    3431           7 :   }
    3432          10 :   return PrevVal;
    3433             : }
    3434           0 : 
    3435          10 : /// Rewrite the Phi values in the specified block to use the mappings
    3436             : /// from the initial operand. Once the Phi is scheduled, we switch
    3437             : /// to using the loop value instead of the Phi value, so those names
    3438           0 : /// do not need to be rewritten.
    3439             : void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
    3440             :                                          unsigned StageNum,
    3441           0 :                                          SMSchedule &Schedule,
    3442             :                                          ValueMapTy *VRMap,
    3443             :                                          InstrMapTy &InstrMap) {
    3444         412 :   for (auto &PHI : BB->phis()) {
    3445             :     unsigned InitVal = 0;
    3446             :     unsigned LoopVal = 0;
    3447             :     getPhiRegs(PHI, BB, InitVal, LoopVal);
    3448             :     unsigned PhiDef = PHI.getOperand(0).getReg();
    3449             : 
    3450             :     unsigned PhiStage =
    3451         170 :         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
    3452             :     unsigned LoopStage =
    3453             :         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
    3454             :     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
    3455             :     if (NumPhis > StageNum)
    3456         730 :       NumPhis = StageNum;
    3457             :     for (unsigned np = 0; np <= NumPhis; ++np) {
    3458             :       unsigned NewVal =
    3459         390 :           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
    3460         390 :       if (!NewVal)
    3461             :         NewVal = InitVal;
    3462             :       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
    3463         390 :                             PhiDef, NewVal);
    3464             :     }
    3465         780 :   }
    3466             : }
    3467         390 : 
    3468             : /// Rewrite a previously scheduled instruction to use the register value
    3469         802 : /// from the new instruction. Make sure the instruction occurs in the
    3470             : /// basic block, and we don't change the uses in the new instruction.
    3471         412 : void SwingSchedulerDAG::rewriteScheduledInstr(
    3472         412 :     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
    3473             :     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
    3474         412 :     unsigned NewReg, unsigned PrevReg) {
    3475             :   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
    3476             :   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
    3477             :   // Rewrite uses that have been scheduled already to use the new
    3478         170 :   // Phi register.
    3479             :   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
    3480             :                                          EI = MRI.use_end();
    3481             :        UI != EI;) {
    3482             :     MachineOperand &UseOp = *UI;
    3483        2124 :     MachineInstr *UseMI = UseOp.getParent();
    3484             :     ++UI;
    3485             :     if (UseMI->getParent() != BB)
    3486             :       continue;
    3487        2124 :     if (UseMI->isPHI()) {
    3488        2124 :       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
    3489             :         continue;
    3490             :       if (getLoopPhiReg(*UseMI, BB) != OldReg)
    3491        2124 :         continue;
    3492             :     }
    3493        7328 :     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    3494             :     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    3495        5204 :     SUnit *OrigMISU = getSUnit(OrigInstr->second);
    3496             :     int StageSched = Schedule.stageScheduled(OrigMISU);
    3497        5204 :     int CycleSched = Schedule.cycleScheduled(OrigMISU);
    3498        3346 :     unsigned ReplaceReg = 0;
    3499             :     // This is the stage for the scheduled instruction.
    3500         598 :     if (StagePhi == StageSched && Phi->isPHI()) {
    3501             :       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
    3502          86 :       if (PrevReg && InProlog)
    3503             :         ReplaceReg = PrevReg;
    3504             :       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
    3505        1858 :                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
    3506             :         ReplaceReg = PrevReg;
    3507        1858 :       else
    3508        1858 :         ReplaceReg = NewReg;
    3509        1858 :     }
    3510             :     // The scheduled instruction occurs before the scheduled Phi, and the
    3511             :     // Phi is not loop carried.
    3512        1858 :     if (!InProlog && StagePhi + 1 == StageSched &&
    3513         977 :         !Schedule.isLoopCarried(this, *Phi))
    3514         977 :       ReplaceReg = NewReg;
    3515             :     if (StagePhi > StageSched && Phi->isPHI())
    3516         977 :       ReplaceReg = NewReg;
    3517           0 :     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
    3518             :       ReplaceReg = NewReg;
    3519             :     if (ReplaceReg) {
    3520             :       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
    3521             :       UseOp.setReg(ReplaceReg);
    3522             :     }
    3523             :   }
    3524        2594 : }
    3525         736 : 
    3526             : /// Check if we can change the instruction to use an offset value from the
    3527        1858 : /// previous iteration. If so, return true and set the base and offset values
    3528             : /// so that we can rewrite the load, if necessary.
    3529        2559 : ///   v1 = Phi(v0, v3)
    3530             : ///   v2 = load v1, 0
    3531        1858 : ///   v3 = post_store v1, 4, x
    3532        3392 : /// This function enables the load to be rewritten as v2 = load v3, 4.
    3533        1696 : bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
    3534             :                                               unsigned &BasePos,
    3535             :                                               unsigned &OffsetPos,
    3536        2124 :                                               unsigned &NewBase,
    3537             :                                               int64_t &Offset) {
    3538             :   // Get the load instruction.
    3539             :   if (TII->isPostIncrement(*MI))
    3540             :     return false;
    3541             :   unsigned BasePosLd, OffsetPosLd;
    3542             :   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    3543             :     return false;
    3544             :   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
    3545           0 : 
    3546             :   // Look for the Phi instruction.
    3547             :   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
    3548             :   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
    3549             :   if (!Phi || !Phi->isPHI())
    3550             :     return false;
    3551           0 :   // Get the register defined in the loop block.
    3552           0 :   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
    3553             :   if (!PrevReg)
    3554           0 :     return false;
    3555           0 : 
    3556           0 :   // Check for the post-increment load/store instruction.
    3557             :   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
    3558             :   if (!PrevDef || PrevDef == MI)
    3559           0 :     return false;
    3560           0 : 
    3561           0 :   if (!TII->isPostIncrement(*PrevDef))
    3562           0 :     return false;
    3563             : 
    3564           0 :   unsigned BasePos1 = 0, OffsetPos1 = 0;
    3565           0 :   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    3566           0 :     return false;
    3567             : 
    3568             :   // Make sure that the instructions do not access the same memory location in
    3569           0 :   // the next iteration.
    3570           0 :   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
    3571           0 :   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
    3572             :   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    3573           0 :   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
    3574           0 :   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
    3575             :   MF.DeleteMachineInstr(NewMI);
    3576           0 :   if (!Disjoint)
    3577           0 :     return false;
    3578           0 : 
    3579             :   // Set the return value once we determine that we return true.
    3580             :   BasePos = BasePosLd;
    3581             :   OffsetPos = OffsetPosLd;
    3582           0 :   NewBase = PrevReg;
    3583           0 :   Offset = StoreOffset;
    3584           0 :   return true;
    3585           0 : }
    3586           0 : 
    3587           0 : /// Apply changes to the instruction if needed. The changes are need
    3588           0 : /// to improve the scheduling and depend up on the final schedule.
    3589           0 : void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
    3590             :                                          SMSchedule &Schedule) {
    3591             :   SUnit *SU = getSUnit(MI);
    3592           0 :   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    3593           0 :       InstrChanges.find(SU);
    3594           0 :   if (It != InstrChanges.end()) {
    3595           0 :     std::pair<unsigned, int64_t> RegAndOffset = It->second;
    3596           0 :     unsigned BasePos, OffsetPos;
    3597             :     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
    3598             :       return;
    3599             :     unsigned BaseReg = MI->getOperand(BasePos).getReg();
    3600             :     MachineInstr *LoopDef = findDefInLoop(BaseReg);
    3601        1633 :     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    3602             :     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    3603             :     int BaseStageNum = Schedule.stageScheduled(SU);
    3604             :     int BaseCycleNum = Schedule.cycleScheduled(SU);
    3605        1633 :     if (BaseStageNum < DefStageNum) {
    3606        1633 :       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    3607          25 :       int OffsetDiff = DefStageNum - BaseStageNum;
    3608             :       if (DefCycleNum < BaseCycleNum) {
    3609          25 :         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
    3610           0 :         if (OffsetDiff > 0)
    3611          25 :           --OffsetDiff;
    3612          25 :       }
    3613          25 :       int64_t NewOffset =
    3614          25 :           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
    3615          25 :       NewMI->getOperand(OffsetPos).setImm(NewOffset);
    3616          25 :       SU->setInstr(NewMI);
    3617          25 :       MISUnitMap[NewMI] = SU;
    3618          19 :       NewMIs.insert(NewMI);
    3619          19 :     }
    3620          19 :   }
    3621           4 : }
    3622             : 
    3623           2 : /// Return true for an order or output dependence that is loop carried
    3624             : /// potentially. A dependence is loop carried if the destination defines a valu
    3625             : /// that may be used or defined by the source in a subsequent iteration.
    3626          19 : bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
    3627          19 :                                          bool isSucc) {
    3628             :   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
    3629          19 :       Dep.isArtificial())
    3630          19 :     return false;
    3631             : 
    3632             :   if (!SwpPruneLoopCarried)
    3633             :     return true;
    3634             : 
    3635             :   if (Dep.getKind() == SDep::Output)
    3636             :     return true;
    3637             : 
    3638        4214 :   MachineInstr *SI = Source->getInstr();
    3639             :   MachineInstr *DI = Dep.getSUnit()->getInstr();
    3640        4214 :   if (!isSucc)
    3641             :     std::swap(SI, DI);
    3642             :   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
    3643             : 
    3644         712 :   // Assume ordered loads and stores may have a loop carried dependence.
    3645             :   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
    3646             :       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    3647         710 :     return true;
    3648             : 
    3649             :   // Only chain dependences between a load and store can be loop carried.
    3650         710 :   if (!DI->mayStore() || !SI->mayLoad())
    3651         710 :     return false;
    3652         710 : 
    3653             :   unsigned DeltaS, DeltaD;
    3654             :   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    3655             :     return true;
    3656             : 
    3657        2124 :   unsigned BaseRegS, BaseRegD;
    3658        2124 :   int64_t OffsetS, OffsetD;
    3659           3 :   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    3660             :   if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
    3661             :       !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
    3662         707 :     return true;
    3663         104 : 
    3664             :   if (BaseRegS != BaseRegD)
    3665             :     return true;
    3666         603 : 
    3667         218 :   // Check that the base register is incremented by a constant value for each
    3668             :   // iteration.
    3669             :   MachineInstr *Def = MRI.getVRegDef(BaseRegS);
    3670             :   if (!Def || !Def->isPHI())
    3671         385 :     return true;
    3672         770 :   unsigned InitVal = 0;
    3673         385 :   unsigned LoopVal = 0;
    3674           0 :   getPhiRegs(*Def, BB, InitVal, LoopVal);
    3675             :   MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
    3676         385 :   int D = 0;
    3677             :   if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
    3678             :     return true;
    3679             : 
    3680             :   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
    3681         118 :   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
    3682         118 : 
    3683             :   // This is the main test, which checks the offset values and the loop
    3684             :   // increment value to determine if the accesses may be loop carried.
    3685             :   if (OffsetS >= OffsetD)
    3686         118 :     return OffsetS + AccessSizeS > DeltaS;
    3687         118 :   else
    3688         118 :     return OffsetD + AccessSizeD > DeltaD;
    3689         118 : 
    3690           0 :   return true;
    3691             : }
    3692         118 : 
    3693         118 : void SwingSchedulerDAG::postprocessDAG() {
    3694             :   for (auto &M : Mutations)
    3695             :     M->apply(this);
    3696             : }
    3697         118 : 
    3698          72 : /// Try to schedule the node at the specified StartCycle and continue
    3699             : /// until the node is schedule or the EndCycle is reached.  This function
    3700          46 : /// returns true if the node is scheduled.  This routine may search either
    3701             : /// forward or backward for a place to insert the instruction based upon
    3702             : /// the relative values of StartCycle and EndCycle.
    3703             : bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
    3704             :   bool forward = true;
    3705             :   if (StartCycle > EndCycle)
    3706         760 :     forward = false;
    3707         570 : 
    3708             :   // The terminating condition depends on the direction.
    3709             :   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
    3710             :   for (int curCycle = StartCycle; curCycle != termCycle;
    3711             :        forward ? ++curCycle : --curCycle) {
    3712             : 
    3713             :     // Add the already scheduled instructions at the specified cycle to the DFA.
    3714             :     Resources->clearResources();
    3715        2952 :     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
    3716             :          checkCycle <= LastCycle; checkCycle += II) {
    3717        2952 :       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
    3718             : 
    3719             :       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
    3720             :                                          E = cycleInstrs.end();
    3721        2952 :            I != E; ++I) {
    3722        4686 :         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
    3723             :           continue;
    3724             :         assert(Resources->canReserveResources(*(*I)->getInstr()) &&
    3725             :                "These instructions have already been scheduled.");
    3726             :         Resources->reserveResources(*(*I)->getInstr());
    3727        3807 :       }
    3728        8642 :     }
    3729        4835 :     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
    3730             :         Resources->canReserveResources(*SU->getInstr())) {
    3731             :       LLVM_DEBUG({
    3732             :         dbgs() << "\tinsert at cycle " << curCycle << " ";
    3733       10479 :         SU->getInstr()->dump();
    3734        5644 :       });
    3735             : 
    3736             :       ScheduledInstrs[curCycle].push_back(SU);
    3737             :       InstrToCycle.insert(std::make_pair(SU, curCycle));
    3738        4339 :       if (curCycle > LastCycle)
    3739             :         LastCycle = curCycle;
    3740             :       if (curCycle < FirstCycle)
    3741        7045 :         FirstCycle = curCycle;
    3742        3238 :       return true;
    3743             :     }
    3744             :     LLVM_DEBUG({
    3745             :       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
    3746             :       SU->getInstr()->dump();
    3747             :     });
    3748        2940 :   }
    3749        2940 :   return false;
    3750        2940 : }
    3751         424 : 
    3752        2940 : // Return the cycle of the earliest scheduled instruction in the chain.
    3753         110 : int SMSchedule::earliestCycleInChain(const SDep &Dep) {
    3754        2940 :   SmallPtrSet<SUnit *, 8> Visited;
    3755             :   SmallVector<SDep, 8> Worklist;
    3756             :   Worklist.push_back(Dep);
    3757             :   int EarlyCycle = INT_MAX;
    3758             :   while (!Worklist.empty()) {
    3759             :     const SDep &Cur = Worklist.pop_back_val();
    3760             :     SUnit *PrevSU = Cur.getSUnit();
    3761          12 :     if (Visited.count(PrevSU))
    3762             :       continue;
    3763             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    3764             :     if (it == InstrToCycle.end())
    3765          16 :       continue;
    3766             :     EarlyCycle = std::min(EarlyCycle, it->second);
    3767             :     for (const auto &PI : PrevSU->Preds)
    3768          16 :       if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
    3769          16 :         Worklist.push_back(PI);
    3770          33 :     Visited.insert(PrevSU);
    3771          17 :   }
    3772          17 :   return EarlyCycle;
    3773          17 : }
    3774           1 : 
    3775             : // Return the cycle of the latest scheduled instruction in the chain.
    3776          17 : int SMSchedule::latestCycleInChain(const SDep &Dep) {
    3777             :   SmallPtrSet<SUnit *, 8> Visited;
    3778          16 :   SmallVector<SDep, 8> Worklist;
    3779          41 :   Worklist.push_back(Dep);
    3780          25 :   int LateCycle = INT_MIN;
    3781           1 :   while (!Worklist.empty()) {
    3782          16 :     const SDep &Cur = Worklist.pop_back_val();
    3783             :     SUnit *SuccSU = Cur.getSUnit();
    3784          16 :     if (Visited.count(SuccSU))
    3785             :       continue;
    3786             :     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    3787             :     if (it == InstrToCycle.end())
    3788         356 :       continue;
    3789             :     LateCycle = std::max(LateCycle, it->second);
    3790             :     for (const auto &SI : SuccSU->Succs)
    3791         356 :       if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
    3792         356 :         Worklist.push_back(SI);
    3793        1093 :     Visited.insert(SuccSU);
    3794         737 :   }
    3795         737 :   return LateCycle;
    3796         737 : }
    3797         140 : 
    3798             : /// If an instruction has a use that spans multiple iterations, then
    3799         633 : /// return true. These instructions are characterized by having a back-ege
    3800             : /// to a Phi, which contains a reference to another Phi.
    3801         597 : static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
    3802        1127 :   for (auto &P : SU->Preds)
    3803         530 :     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
    3804         381 :       for (auto &S : P.getSUnit()->Succs)
    3805         597 :         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
    3806             :           return P.getSUnit();
    3807         356 :   return nullptr;
    3808             : }
    3809             : 
    3810             : /// Compute the scheduling start slot for the instruction.  The start slot
    3811             : /// depends on any predecessor or successor nodes scheduled already.
    3812             : void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
    3813           0 :                               int *MinEnd, int *MaxStart, int II,
    3814           0 :                               SwingSchedulerDAG *DAG) {
    3815           0 :   // Iterate over each instruction that has been scheduled already.  The start
    3816           0 :   // slot computation depends on whether the previously scheduled instruction
    3817           0 :   // is a predecessor or successor of the specified instruction.
    3818             :   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    3819             : 
    3820             :     // Iterate over each instruction in the current cycle.
    3821             :     for (SUnit *I : getInstructions(cycle)) {
    3822             :       // Because we're processing a DAG for the dependences, we recognize
    3823             :       // the back-edge in recurrences by anti dependences.
    3824        3030 :       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
    3825             :         const SDep &Dep = SU->Preds[i];
    3826             :         if (Dep.getSUnit() == I) {
    3827             :           if (!DAG->isBackedge(SU, Dep)) {
    3828             :             int EarlyStart = cycle + Dep.getLatency() -
    3829             :                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
    3830       26824 :             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
    3831             :             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
    3832             :               int End = earliestCycleInChain(Dep) + (II - 1);
    3833       73425 :               *MinEnd = std::min(*MinEnd, End);
    3834             :             }
    3835             :           } else {
    3836       63805 :             int LateStart = cycle - Dep.getLatency() +
    3837       37968 :                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
    3838       37968 :             *MinLateStart = std::min(*MinLateStart, LateStart);
    3839        1109 :           }
    3840        1063 :         }
    3841        1063 :         // For instruction that requires multiple iterations, make sure that
    3842        1063 :         // the dependent instruction is not scheduled past the definition.
    3843        1063 :         SUnit *BE = multipleIterations(I, DAG);
    3844          16 :         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
    3845          16 :             !SU->isPred(I))
    3846             :           *MinLateStart = std::min(*MinLateStart, cycle);
    3847             :       }
    3848          46 :       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
    3849          46 :         if (SU->Succs[i].getSUnit() == I) {
    3850          46 :           const SDep &Dep = SU->Succs[i];
    3851             :           if (!DAG->isBackedge(SU, Dep)) {
    3852             :             int LateStart = cycle - Dep.getLatency() +
    3853             :                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
    3854             :             *MinLateStart = std::min(*MinLateStart, LateStart);
    3855       37968 :             if (DAG->isLoopCarriedDep(SU, Dep)) {
    3856       37977 :               int Start = latestCycleInChain(Dep) + 1 - II;
    3857             :               *MaxStart = std::max(*MaxStart, Start);
    3858           9 :             }
    3859             :           } else {
    3860       65665 :             int EarlyStart = cycle + Dep.getLatency() -
    3861       79656 :                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
    3862             :             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
    3863        3007 :           }
    3864        2641 :         }
    3865        2641 :       }
    3866        2641 :     }
    3867        2641 :   }
    3868         356 : }
    3869         356 : 
    3870             : /// Order the instructions within a cycle so that the definitions occur
    3871             : /// before the uses. Returns true if the instruction is added to the start
    3872         366 : /// of the list, or false if added to the end.
    3873         366 : void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
    3874         366 :                                  std::deque<SUnit *> &Insts) {
    3875             :   MachineInstr *MI = SU->getInstr();
    3876             :   bool OrderBeforeUse = false;
    3877             :   bool OrderAfterDef = false;
    3878             :   bool OrderBeforeDef = false;
    3879             :   unsigned MoveDef = 0;
    3880        3030 :   unsigned MoveUse = 0;
    3881             :   int StageInst1 = stageScheduled(SU);
    3882             : 
    3883             :   unsigned Pos = 0;
    3884             :   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
    3885        1398 :        ++I, ++Pos) {
    3886             :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    3887        1398 :       MachineOperand &MO = MI->getOperand(i);
    3888             :       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    3889             :         continue;
    3890             : 
    3891             :       unsigned Reg = MO.getReg();
    3892             :       unsigned BasePos, OffsetPos;
    3893        1398 :       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
    3894             :         if (MI->getOperand(BasePos).getReg() == Reg)
    3895             :           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
    3896        2825 :             Reg = NewReg;
    3897             :       bool Reads, Writes;
    3898        6127 :       std::tie(Reads, Writes) =
    3899        4700 :           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
    3900        4700 :       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
    3901        1045 :         OrderBeforeUse = true;
    3902             :         if (MoveUse == 0)
    3903             :           MoveUse = Pos;
    3904             :       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
    3905        3655 :         // Add the instruction after the scheduled instruction.
    3906        1818 :         OrderAfterDef = true;
    3907         487 :         MoveDef = Pos;
    3908             :       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
    3909             :         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
    3910             :           OrderBeforeUse = true;
    3911        3655 :           if (MoveUse == 0)
    3912        3877 :             MoveUse = Pos;
    3913             :         } else {
    3914          97 :           OrderAfterDef = true;
    3915             :           MoveDef = Pos;
    3916        3683 :         }
    3917             :       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
    3918             :         OrderBeforeUse = true;
    3919             :         if (MoveUse == 0)
    3920        3552 :           MoveUse = Pos;
    3921          84 :         if (MoveUse != 0) {
    3922             :           OrderAfterDef = true;
    3923           0 :           MoveDef = Pos - 1;
    3924             :         }
    3925             :       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
    3926             :         // Add the instruction before the scheduled instruction.
    3927             :         OrderBeforeUse = true;
    3928             :         if (MoveUse == 0)
    3929        3468 :           MoveUse = Pos;
    3930             :       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
    3931          28 :                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
    3932             :         if (MoveUse == 0) {
    3933          28 :           OrderBeforeDef = true;
    3934             :           MoveUse = Pos;
    3935          12 :         }
    3936             :       }
    3937        3412 :     }
    3938             :     // Check for order dependences between instructions. Make sure the source
    3939             :     // is ordered before the destination.
    3940          49 :     for (auto &S : SU->Succs) {
    3941             :       if (S.getSUnit() != *I)
    3942        6635 :         continue;
    3943        1196 :       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
    3944          98 :         OrderBeforeUse = true;
    3945             :         if (Pos < MoveUse)
    3946             :           MoveUse = Pos;
    3947             :       }
    3948             :     }
    3949             :     for (auto &P : SU->Preds) {
    3950             :       if (P.getSUnit() != *I)
    3951             :         continue;
    3952        2993 :       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
    3953        1566 :         OrderAfterDef = true;
    3954             :         MoveDef = Pos;
    3955         266 :       }
    3956             :     }
    3957           6 :   }
    3958             : 
    3959             :   // A circular dependence.
    3960             :   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    3961        3467 :     OrderBeforeUse = false;
    3962        2040 : 
    3963             :   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
    3964          99 :   // to a loop-carried dependence.
    3965             :   if (OrderBeforeDef)
    3966             :     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
    3967             : 
    3968             :   // The uncommon case when the instruction order needs to be updated because
    3969             :   // there is both a use and def.
    3970             :   if (OrderBeforeUse && OrderAfterDef) {
    3971             :     SUnit *UseSU = Insts.at(MoveUse);
    3972        1398 :     SUnit *DefSU = Insts.at(MoveDef);
    3973             :     if (MoveUse > MoveDef) {
    3974             :       Insts.erase(Insts.begin() + MoveUse);
    3975             :       Insts.erase(Insts.begin() + MoveDef);
    3976             :     } else {
    3977        1398 :       Insts.erase(Insts.begin() + MoveDef);
    3978          97 :       Insts.erase(Insts.begin() + MoveUse);
    3979             :     }
    3980             :     orderDependence(SSD, UseSU, Insts);
    3981             :     orderDependence(SSD, SU, Insts);
    3982        1398 :     orderDependence(SSD, DefSU, Insts);
    3983          47 :     return;
    3984          47 :   }
    3985          47 :   // Put the new instruction first if there is a use in the list. Otherwise,
    3986          86 :   // put it at the end of the list.
    3987         129 :   if (OrderBeforeUse)
    3988             :     Insts.push_front(SU);
    3989           8 :   else
    3990          12 :     Insts.push_back(SU);
    3991             : }
    3992          47 : 
    3993          47 : /// Return true if the scheduled Phi has a loop carried operand.
    3994          47 : bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
    3995          47 :   if (!Phi.isPHI())
    3996             :     return false;
    3997             :   assert(Phi.isPHI() && "Expecting a Phi.");
    3998             :   SUnit *DefSU = SSD->getSUnit(&Phi);
    3999        1351 :   unsigned DefCycle = cycleScheduled(DefSU);
    4000             :   int DefStage = stageScheduled(DefSU);
    4001             : 
    4002             :   unsigned InitVal = 0;
    4003             :   unsigned LoopVal = 0;
    4004             :   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
    4005             :   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
    4006        2149 :   if (!UseSU)
    4007             :     return true;
    4008             :   if (UseSU->getInstr()->isPHI())
    4009             :     return true;
    4010             :   unsigned LoopCycle = cycleScheduled(UseSU);
    4011        1517 :   int LoopStage = stageScheduled(UseSU);
    4012        1517 :   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
    4013             : }
    4014             : 
    4015             : /// Return true if the instruction is a definition that is loop carried
    4016        1517 : /// and defines the use on the next iteration.
    4017        1517 : ///        v1 = phi(v2, v3)
    4018        1515 : ///  (Def) v3 = op v1
    4019           2 : ///  (MO)   = v1
    4020        1515 : /// If MO appears before Def, then then v1 and v3 may get assigned to the same
    4021             : /// register.
    4022        1469 : bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
    4023        1469 :                                        MachineInstr *Def, MachineOperand &MO) {
    4024        1469 :   if (!MO.isReg())
    4025             :     return false;
    4026             :   if (Def->isPHI())
    4027             :     return false;
    4028             :   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
    4029             :   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    4030             :     return false;
    4031             :   if (!isLoopCarried(SSD, *Phi))
    4032             :     return false;
    4033             :   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
    4034        1196 :   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
    4035             :     MachineOperand &DMO = Def->getOperand(i);
    4036        1196 :     if (!DMO.isReg() || !DMO.isDef())
    4037             :       continue;
    4038             :     if (DMO.getReg() == LoopReg)
    4039             :       return true;
    4040        1196 :   }
    4041        1561 :   return false;
    4042             : }
    4043         348 : 
    4044             : // Check if the generated schedule is valid. This function checks if
    4045         328 : // an instruction that uses a physical register is scheduled in a
    4046        1093 : // different stage than the definition. The pipeliner does not handle
    4047         863 : // physical register values that may cross a basic block boundary.
    4048         863 : bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
    4049             :   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
    4050         327 :     SUnit &SU = SSD->SUnits[i];
    4051             :     if (!SU.hasPhysRegDefs)
    4052             :       continue;
    4053             :     int StageDef = stageScheduled(&SU);
    4054             :     assert(StageDef != -1 && "Instruction should have been scheduled.");
    4055             :     for (auto &SI : SU.Succs)
    4056             :       if (SI.isAssignedRegDep())
    4057             :         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
    4058             :           if (stageScheduled(SI.getSUnit()) != StageDef)
    4059             :             return false;
    4060         175 :   }
    4061        1983 :   return true;
    4062        1633 : }
    4063        1633 : 
    4064             : /// A property of the node order in swing-modulo-scheduling is
    4065           0 : /// that for nodes outside circuits the following holds:
    4066             : /// none of them is scheduled after both a successor and a
    4067           0 : /// predecessor.
    4068             : /// The method below checks whether the property is met.
    4069           0 : /// If not, debug information is printed and statistics information updated.
    4070           0 : /// Note that we do not use an assert statement.
    4071             : /// The reason is that although an invalid node oder may prevent
    4072             : /// the pipeliner from finding a pipelined schedule for arbitrary II,
    4073             : /// it does not lead to the generation of incorrect code.
    4074             : void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
    4075             : 
    4076             :   // a sorted vector that maps each SUnit to its index in the NodeOrder
    4077             :   typedef std::pair<SUnit *, unsigned> UnitIndex;
    4078             :   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
    4079             : 
    4080             :   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
    4081             :     Indices.push_back(std::make_pair(NodeOrder[i], i));
    4082             : 
    4083             :   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
    4084             :     return std::get<0>(i1) < std::get<0>(i2);
    4085             :   };
    4086         188 : 
    4087             :   // sort, so that we can perform a binary search
    4088             :   llvm::sort(Indices, CompareKey);
    4089             : 
    4090         188 :   bool Valid = true;
    4091             :   (void)Valid;
    4092        2033 :   // for each SUnit in the NodeOrder, check whether
    4093        3690 :   // it appears after both a successor and a predecessor
    4094             :   // of the SUnit. If this is the case, and the SUnit
    4095             :   // is not part of circuit, then the NodeOrder is not
    4096           0 :   // valid.
    4097             :   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
    4098             :     SUnit *SU = NodeOrder[i];
    4099             :     unsigned Index = i;
    4100             : 
    4101             :     bool PredBefore = false;
    4102             :     bool SuccBefore = false;
    4103             : 
    4104             :     SUnit *Succ;
    4105             :     SUnit *Pred;
    4106             :     (void)Succ;
    4107             :     (void)Pred;
    4108             : 
    4109        2033 :     for (SDep &PredEdge : SU->Preds) {
    4110        3690 :       SUnit *PredSU = PredEdge.getSUnit();
    4111             :       unsigned PredIndex =
    4112             :           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
    4113             :                                         std::make_pair(PredSU, 0), CompareKey));
    4114             :       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
    4115             :         PredBefore = true;
    4116             :         Pred = PredSU;
    4117             :         break;
    4118             :       }
    4119             :     }
    4120             : 
    4121        3761 :     for (SDep &SuccEdge : SU->Succs) {
    4122             :       SUnit *SuccSU = SuccEdge.getSUnit();
    4123             :       unsigned SuccIndex =
    4124             :           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
    4125        2256 :                                         std::make_pair(SuccSU, 0), CompareKey));
    4126        3572 :       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
    4127             :         SuccBefore = true;
    4128             :         Succ = SuccSU;
    4129             :         break;
    4130             :       }
    4131             :     }
    4132             : 
    4133        2420 :     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
    4134             :       // instructions in circuits are allowed to be scheduled
    4135             :       // after both a successor and predecessor.
    4136             :       bool InCircuit = std::any_of(
    4137        1703 :           Circuits.begin(), Circuits.end(),
    4138        3396 :           [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
    4139             :       if (InCircuit)
    4140             :         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
    4141             :       else {
    4142             :         Valid = false;
    4143             :         NumNodeOrderIssues++;
    4144             :         LLVM_DEBUG(dbgs() << "Predecessor ";);
    4145        1845 :       }
    4146             :       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
    4147             :                         << " are scheduled before node " << SU->NodeNum
    4148             :                         << "\n";);
    4149             :     }
    4150             :   }
    4151             : 
    4152             :   LLVM_DEBUG({
    4153             :     if (!Valid)
    4154             :       dbgs() << "Invalid node order found!\n";
    4155             :   });
    4156             : }
    4157             : 
    4158             : /// Attempt to fix the degenerate cases when the instruction serialization
    4159             : /// causes the register lifetimes to overlap. For example,
    4160             : ///   p' = store_pi(p, b)
    4161             : ///      = load p, offset
    4162             : /// In this case p and p' overlap, which means that two registers are needed.
    4163             : /// Instead, this function changes the load to use p' and updates the offset.
    4164             : void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
    4165             :   unsigned OverlapReg = 0;
    4166             :   unsigned NewBaseReg = 0;
    4167             :   for (SUnit *SU : Instrs) {
    4168         188 :     MachineInstr *MI = SU->getInstr();
    4169             :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    4170             :       const MachineOperand &MO = MI->getOperand(i);
    4171             :       // Look for an instruction that uses p. The instruction occurs in the
    4172             :       // same cycle but occurs later in the serialized order.
    4173             :       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
    4174             :         // Check that the instruction appears in the InstrChanges structure,
    4175             :         // which contains instructions that can have the offset updated.
    4176         500 :         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
    4177             :           InstrChanges.find(SU);
    4178             :         if (It != InstrChanges.end()) {
    4179        2133 :           unsigned BasePos, OffsetPos;
    4180        1633 :           // Update the base register and adjust the offset.
    4181        6817 :           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
    4182        5434 :             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
    4183             :             NewMI->getOperand(BasePos).setReg(NewBaseReg);
    4184             :             int64_t NewOffset =
    4185        5434 :                 MI->getOperand(OffsetPos).getImm() - It->second.second;
    4186             :             NewMI->getOperand(OffsetPos).setImm(NewOffset);
    4187             :             SU->setInstr(NewMI);
    4188             :             MISUnitMap[NewMI] = SU;
    4189           2 :             NewMIs.insert(NewMI);
    4190           2 :           }
    4191             :         }
    4192             :         OverlapReg = 0;
    4193           2 :         NewBaseReg = 0;
    4194           2 :         break;
    4195           4 :       }
    4196             :       // Look for an instruction of the form p' = op(p), which uses and defines
    4197           2 :       // two virtual registers that get allocated to the same physical register.
    4198           2 :       unsigned TiedUseIdx = 0;
    4199             :       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
    4200           2 :         // OverlapReg is p in the example above.
    4201           2 :         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
    4202             :         // NewBaseReg is p' in the example above.
    4203             :         NewBaseReg = MI->getOperand(i).getReg();
    4204             :         break;
    4205             :       }
    4206             :     }
    4207             :   }
    4208             : }
    4209             : 
    4210        5432 : /// After the schedule has been formed, call this function to combine
    4211        5432 : /// the instructions from the different stages/cycles.  That is, this
    4212             : /// function creates a schedule that represents a single iteration.
    4213         496 : void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
    4214             :   // Move all instructions to the first stage from later stages.
    4215         248 :   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    4216         248 :     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
    4217             :          ++stage) {
    4218             :       std::deque<SUnit *> &cycleInstrs =
    4219             :           ScheduledInstrs[cycle + (stage * InitiationInterval)];
    4220         500 :       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
    4221             :                                                  E = cycleInstrs.rend();
    4222             :            I != E; ++I)
    4223             :         ScheduledInstrs[cycle].push_front(*I);
    4224             :     }
    4225         175 :   }
    4226             :   // Iterate over the definitions in each instruction, and compute the
    4227         675 :   // stage difference for each use.  Keep the maximum value.
    4228         983 :   for (auto &I : InstrToCycle) {
    4229             :     int DefStage = stageScheduled(I.first);
    4230             :     MachineInstr *MI = I.first->getInstr();
    4231         483 :     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
    4232             :       MachineOperand &Op = MI->getOperand(i);
    4233             :       if (!Op.isReg() || !Op.isDef())
    4234        1107 :         continue;
    4235             : 
    4236             :       unsigned Reg = Op.getReg();
    4237             :       unsigned MaxDiff = 0;
    4238             :       bool PhiIsSwapped = false;
    4239             :       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
    4240        1808 :                                              EI = MRI.use_end();
    4241        1633 :            UI != EI; ++UI) {
    4242        1633 :         MachineOperand &UseOp = *UI;
    4243        7733 :         MachineInstr *UseMI = UseOp.getParent();
    4244        6100 :         SUnit *SUnitUse = SSD->getSUnit(UseMI);
    4245        6100 :         int UseStage = stageScheduled(SUnitUse);
    4246        4476 :         unsigned Diff = 0;
    4247             :         if (UseStage != -1 && UseStage >= DefStage)
    4248        1624 :           Diff = UseStage - DefStage;
    4249        1624 :         if (MI->isPHI()) {
    4250             :           if (isLoopCarried(SSD, *MI))
    4251        1624 :             ++Diff;
    4252             :           else
    4253        3789 :             PhiIsSwapped = true;
    4254             :         }
    4255        2165 :         MaxDiff = std::max(Diff, MaxDiff);
    4256             :       }
    4257        2165 :       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    4258        2165 :     }
    4259        2165 :   }
    4260        2058 : 
    4261             :   // Erase all the elements in the later stages. Only one iteration should
    4262         659 :   // remain in the scheduled list, and it contains all the instructions.
    4263         632 :   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    4264             :     ScheduledInstrs.erase(cycle);
    4265             : 
    4266             :   // Change the registers in instruction as specified in the InstrChanges
    4267        2165 :   // map. We need to use the new registers to create the correct order.
    4268             :   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
    4269        1624 :     SUnit *SU = &SSD->SUnits[i];
    4270             :     SSD->applyInstrChange(SU->getInstr(), *this);
    4271             :   }
    4272             : 
    4273             :   // Reorder the instructions in each cycle to fix and improve the
    4274             :   // generated code.
    4275         591 :   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    4276         416 :     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    4277             :     std::deque<SUnit *> newOrderPhi;
    4278             :     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
    4279             :       SUnit *SU = cycleInstrs[i];
    4280        1983 :       if (SU->getInstr()->isPHI())
    4281        1633 :         newOrderPhi.push_back(SU);
    4282        1633 :     }
    4283             :     std::deque<SUnit *> newOrderI;
    4284             :     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
    4285             :       SUnit *SU = cycleInstrs[i];
    4286             :       if (!SU->getInstr()->isPHI())
    4287         675 :         orderDependence(SSD, SU, newOrderI);
    4288         500 :     }
    4289             :     // Replace the old order with the new order.
    4290        2133 :     cycleInstrs.swap(newOrderPhi);
    4291        1633 :     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
    4292        1633 :     SSD->fixupRegisterOverlaps(cycleInstrs);
    4293             :   }
    4294             : 
    4295             :   LLVM_DEBUG(dump(););
    4296        2133 : }
    4297        1633 : 
    4298        1633 : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    4299        1257 : /// Print the schedule information to the given output.
    4300             : void SMSchedule::print(raw_ostream &os) const {
    4301             :   // Iterate over each cycle.
    4302             :   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    4303         500 :     // Iterate over each instruction in the cycle.
    4304         500 :     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    4305             :     for (SUnit *CI : cycleInstrs->second) {
    4306             :       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
    4307             :       os << "(" << CI->NodeNum << ") ";
    4308         175 :       CI->getInstr()->print(os);
    4309             :       os << "\n";
    4310             :     }
    4311             :   }
    4312             : }
    4313             : 
    4314             : /// Utility function used for debugging to print the schedule.
    4315             : LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
    4316             : #endif

Generated by: LCOV version 1.13