LLVM  8.0.0svn
MachinePipeliner.cpp
Go to the documentation of this file.
1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11 //
12 // Software pipelining (SWP) is an instruction scheduling technique for loops
13 // that overlap loop iterations and exploits ILP via a compiler transformation.
14 //
15 // Swing Modulo Scheduling is an implementation of software pipelining
16 // that generates schedules that are near optimal in terms of initiation
17 // interval, register requirements, and stage count. See the papers:
18 //
19 // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
20 // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
21 // Conference on Parallel Architectures and Compilation Techiniques.
22 //
23 // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
24 // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
25 // Transactions on Computers, Vol. 50, No. 3, 2001.
26 //
27 // "An Implementation of Swing Modulo Scheduling With Extensions for
28 // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
29 // Urbana-Chambpain, 2005.
30 //
31 //
32 // The SMS algorithm consists of three main steps after computing the minimal
33 // initiation interval (MII).
34 // 1) Analyze the dependence graph and compute information about each
35 // instruction in the graph.
36 // 2) Order the nodes (instructions) by priority based upon the heuristics
37 // described in the algorithm.
38 // 3) Attempt to schedule the nodes in the specified order using the MII.
39 //
40 // This SMS implementation is a target-independent back-end pass. When enabled,
41 // the pass runs just prior to the register allocation pass, while the machine
42 // IR is in SSA form. If software pipelining is successful, then the original
43 // loop is replaced by the optimized loop. The optimized loop contains one or
44 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
45 // the instructions cannot be scheduled in a given MII, we increase the MII by
46 // one and try again.
47 //
48 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
49 // represent loop carried dependences in the DAG as order edges to the Phi
50 // nodes. We also perform several passes over the DAG to eliminate unnecessary
51 // edges that inhibit the ability to pipeline. The implementation uses the
52 // DFAPacketizer class to compute the minimum initiation interval and the check
53 // where an instruction may be inserted in the pipelined schedule.
54 //
55 // In order for the SMS pass to work, several target specific hooks need to be
56 // implemented to get information about the loop structure and to rewrite
57 // instructions.
58 //
59 //===----------------------------------------------------------------------===//
60 
61 #include "llvm/ADT/ArrayRef.h"
62 #include "llvm/ADT/BitVector.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PriorityQueue.h"
66 #include "llvm/ADT/SetVector.h"
67 #include "llvm/ADT/SmallPtrSet.h"
68 #include "llvm/ADT/SmallSet.h"
69 #include "llvm/ADT/SmallVector.h"
70 #include "llvm/ADT/Statistic.h"
96 #include "llvm/Config/llvm-config.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/DebugLoc.h"
99 #include "llvm/IR/Function.h"
100 #include "llvm/MC/LaneBitmask.h"
101 #include "llvm/MC/MCInstrDesc.h"
103 #include "llvm/MC/MCRegisterInfo.h"
104 #include "llvm/Pass.h"
106 #include "llvm/Support/Compiler.h"
107 #include "llvm/Support/Debug.h"
108 #include "llvm/Support/MathExtras.h"
110 #include <algorithm>
111 #include <cassert>
112 #include <climits>
113 #include <cstdint>
114 #include <deque>
115 #include <functional>
116 #include <iterator>
117 #include <map>
118 #include <memory>
119 #include <tuple>
120 #include <utility>
121 #include <vector>
122 
123 using namespace llvm;
124 
125 #define DEBUG_TYPE "pipeliner"
126 
127 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
128 STATISTIC(NumPipelined, "Number of loops software pipelined");
129 STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
130 
131 /// A command line option to turn software pipelining on or off.
132 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
134  cl::desc("Enable Software Pipelining"));
135 
136 /// A command line option to enable SWP at -Os.
137 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
138  cl::desc("Enable SWP at Os."), cl::Hidden,
139  cl::init(false));
140 
141 /// A command line argument to limit minimum initial interval for pipelining.
142 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
143  cl::desc("Size limit for the MII."),
144  cl::Hidden, cl::init(27));
145 
146 /// A command line argument to limit the number of stages in the pipeline.
147 static cl::opt<int>
148  SwpMaxStages("pipeliner-max-stages",
149  cl::desc("Maximum stages allowed in the generated scheduled."),
150  cl::Hidden, cl::init(3));
151 
152 /// A command line option to disable the pruning of chain dependences due to
153 /// an unrelated Phi.
154 static cl::opt<bool>
155  SwpPruneDeps("pipeliner-prune-deps",
156  cl::desc("Prune dependences between unrelated Phi nodes."),
157  cl::Hidden, cl::init(true));
158 
159 /// A command line option to disable the pruning of loop carried order
160 /// dependences.
161 static cl::opt<bool>
162  SwpPruneLoopCarried("pipeliner-prune-loop-carried",
163  cl::desc("Prune loop carried order dependences."),
164  cl::Hidden, cl::init(true));
165 
166 #ifndef NDEBUG
167 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
168 #endif
169 
170 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
171  cl::ReallyHidden, cl::init(false),
172  cl::ZeroOrMore, cl::desc("Ignore RecMII"));
173 
174 namespace {
175 
176 class NodeSet;
177 class SMSchedule;
178 
179 /// The main class in the implementation of the target independent
180 /// software pipeliner pass.
181 class MachinePipeliner : public MachineFunctionPass {
182 public:
183  MachineFunction *MF = nullptr;
184  const MachineLoopInfo *MLI = nullptr;
185  const MachineDominatorTree *MDT = nullptr;
186  const InstrItineraryData *InstrItins;
187  const TargetInstrInfo *TII = nullptr;
188  RegisterClassInfo RegClassInfo;
189 
190 #ifndef NDEBUG
191  static int NumTries;
192 #endif
193 
194  /// Cache the target analysis information about the loop.
195  struct LoopInfo {
196  MachineBasicBlock *TBB = nullptr;
197  MachineBasicBlock *FBB = nullptr;
199  MachineInstr *LoopInductionVar = nullptr;
200  MachineInstr *LoopCompare = nullptr;
201  };
202  LoopInfo LI;
203 
204  static char ID;
205 
206  MachinePipeliner() : MachineFunctionPass(ID) {
208  }
209 
210  bool runOnMachineFunction(MachineFunction &MF) override;
211 
212  void getAnalysisUsage(AnalysisUsage &AU) const override {
219  }
220 
221 private:
222  void preprocessPhiNodes(MachineBasicBlock &B);
223  bool canPipelineLoop(MachineLoop &L);
224  bool scheduleLoop(MachineLoop &L);
225  bool swingModuloScheduler(MachineLoop &L);
226 };
227 
228 /// This class builds the dependence graph for the instructions in a loop,
229 /// and attempts to schedule the instructions using the SMS algorithm.
230 class SwingSchedulerDAG : public ScheduleDAGInstrs {
231  MachinePipeliner &Pass;
232  /// The minimum initiation interval between iterations for this schedule.
233  unsigned MII = 0;
234  /// Set to true if a valid pipelined schedule is found for the loop.
235  bool Scheduled = false;
236  MachineLoop &Loop;
237  LiveIntervals &LIS;
238  const RegisterClassInfo &RegClassInfo;
239 
240  /// A toplogical ordering of the SUnits, which is needed for changing
241  /// dependences and iterating over the SUnits.
243 
244  struct NodeInfo {
245  int ASAP = 0;
246  int ALAP = 0;
247  int ZeroLatencyDepth = 0;
248  int ZeroLatencyHeight = 0;
249 
250  NodeInfo() = default;
251  };
252  /// Computed properties for each node in the graph.
253  std::vector<NodeInfo> ScheduleInfo;
254 
255  enum OrderKind { BottomUp = 0, TopDown = 1 };
256  /// Computed node ordering for scheduling.
258 
259  using NodeSetType = SmallVector<NodeSet, 8>;
260  using ValueMapTy = DenseMap<unsigned, unsigned>;
261  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
262  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
263 
264  /// Instructions to change when emitting the final schedule.
266 
267  /// We may create a new instruction, so remember it because it
268  /// must be deleted when the pass is finished.
270 
271  /// Ordered list of DAG postprocessing steps.
272  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
273 
274  /// Helper class to implement Johnson's circuit finding algorithm.
275  class Circuits {
276  std::vector<SUnit> &SUnits;
277  SetVector<SUnit *> Stack;
278  BitVector Blocked;
281  unsigned NumPaths;
282  static unsigned MaxPaths;
283 
284  public:
285  Circuits(std::vector<SUnit> &SUs)
286  : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
287 
288  /// Reset the data structures used in the circuit algorithm.
289  void reset() {
290  Stack.clear();
291  Blocked.reset();
292  B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
293  NumPaths = 0;
294  }
295 
296  void createAdjacencyStructure(SwingSchedulerDAG *DAG);
297  bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
298  void unblock(int U);
299  };
300 
301 public:
302  SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
303  const RegisterClassInfo &rci)
304  : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
305  RegClassInfo(rci), Topo(SUnits, &ExitSU) {
306  P.MF->getSubtarget().getSMSMutations(Mutations);
307  }
308 
309  void schedule() override;
310  void finishBlock() override;
311 
312  /// Return true if the loop kernel has been scheduled.
313  bool hasNewSchedule() { return Scheduled; }
314 
315  /// Return the earliest time an instruction may be scheduled.
316  int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
317 
318  /// Return the latest time an instruction my be scheduled.
319  int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
320 
321  /// The mobility function, which the number of slots in which
322  /// an instruction may be scheduled.
323  int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
324 
325  /// The depth, in the dependence graph, for a node.
326  unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
327 
328  /// The maximum unweighted length of a path from an arbitrary node to the
329  /// given node in which each edge has latency 0
330  int getZeroLatencyDepth(SUnit *Node) {
331  return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
332  }
333 
334  /// The height, in the dependence graph, for a node.
335  unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
336 
337  /// The maximum unweighted length of a path from the given node to an
338  /// arbitrary node in which each edge has latency 0
339  int getZeroLatencyHeight(SUnit *Node) {
340  return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
341  }
342 
343  /// Return true if the dependence is a back-edge in the data dependence graph.
344  /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
345  /// using an anti dependence from a Phi to an instruction.
346  bool isBackedge(SUnit *Source, const SDep &Dep) {
347  if (Dep.getKind() != SDep::Anti)
348  return false;
349  return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
350  }
351 
352  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
353 
354  /// The distance function, which indicates that operation V of iteration I
355  /// depends on operations U of iteration I-distance.
356  unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
357  // Instructions that feed a Phi have a distance of 1. Computing larger
358  // values for arrays requires data dependence information.
359  if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
360  return 1;
361  return 0;
362  }
363 
364  /// Set the Minimum Initiation Interval for this schedule attempt.
365  void setMII(unsigned mii) { MII = mii; }
366 
367  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
368 
369  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
370 
371  /// Return the new base register that was stored away for the changed
372  /// instruction.
373  unsigned getInstrBaseReg(SUnit *SU) {
375  InstrChanges.find(SU);
376  if (It != InstrChanges.end())
377  return It->second.first;
378  return 0;
379  }
380 
381  void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
382  Mutations.push_back(std::move(Mutation));
383  }
384 
385 private:
386  void addLoopCarriedDependences(AliasAnalysis *AA);
387  void updatePhiDependences();
388  void changeDependences();
389  unsigned calculateResMII();
390  unsigned calculateRecMII(NodeSetType &RecNodeSets);
391  void findCircuits(NodeSetType &NodeSets);
392  void fuseRecs(NodeSetType &NodeSets);
393  void removeDuplicateNodes(NodeSetType &NodeSets);
394  void computeNodeFunctions(NodeSetType &NodeSets);
395  void registerPressureFilter(NodeSetType &NodeSets);
396  void colocateNodeSets(NodeSetType &NodeSets);
397  void checkNodeSets(NodeSetType &NodeSets);
398  void groupRemainingNodes(NodeSetType &NodeSets);
399  void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
400  SetVector<SUnit *> &NodesAdded);
401  void computeNodeOrder(NodeSetType &NodeSets);
402  void checkValidNodeOrder(const NodeSetType &Circuits) const;
403  bool schedulePipeline(SMSchedule &Schedule);
404  void generatePipelinedLoop(SMSchedule &Schedule);
405  void generateProlog(SMSchedule &Schedule, unsigned LastStage,
406  MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
407  MBBVectorTy &PrologBBs);
408  void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
409  MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
410  MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
411  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
412  MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
413  SMSchedule &Schedule, ValueMapTy *VRMap,
414  InstrMapTy &InstrMap, unsigned LastStageNum,
415  unsigned CurStageNum, bool IsLast);
416  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
417  MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
418  SMSchedule &Schedule, ValueMapTy *VRMap,
419  InstrMapTy &InstrMap, unsigned LastStageNum,
420  unsigned CurStageNum, bool IsLast);
421  void removeDeadInstructions(MachineBasicBlock *KernelBB,
422  MBBVectorTy &EpilogBBs);
423  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
424  SMSchedule &Schedule);
425  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
426  MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
427  ValueMapTy *VRMap);
428  bool computeDelta(MachineInstr &MI, unsigned &Delta);
429  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
430  unsigned Num);
431  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
432  unsigned InstStageNum);
433  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
434  unsigned InstStageNum,
435  SMSchedule &Schedule);
436  void updateInstruction(MachineInstr *NewMI, bool LastDef,
437  unsigned CurStageNum, unsigned InstrStageNum,
438  SMSchedule &Schedule, ValueMapTy *VRMap);
439  MachineInstr *findDefInLoop(unsigned Reg);
440  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
441  unsigned LoopStage, ValueMapTy *VRMap,
442  MachineBasicBlock *BB);
443  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
444  SMSchedule &Schedule, ValueMapTy *VRMap,
445  InstrMapTy &InstrMap);
446  void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
447  InstrMapTy &InstrMap, unsigned CurStageNum,
448  unsigned PhiNum, MachineInstr *Phi,
449  unsigned OldReg, unsigned NewReg,
450  unsigned PrevReg = 0);
451  bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
452  unsigned &OffsetPos, unsigned &NewBase,
453  int64_t &NewOffset);
454  void postprocessDAG();
455 };
456 
457 /// A NodeSet contains a set of SUnit DAG nodes with additional information
458 /// that assigns a priority to the set.
459 class NodeSet {
460  SetVector<SUnit *> Nodes;
461  bool HasRecurrence = false;
462  unsigned RecMII = 0;
463  int MaxMOV = 0;
464  unsigned MaxDepth = 0;
465  unsigned Colocate = 0;
466  SUnit *ExceedPressure = nullptr;
467  unsigned Latency = 0;
468 
469 public:
470  using iterator = SetVector<SUnit *>::const_iterator;
471 
472  NodeSet() = default;
473  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
474  Latency = 0;
475  for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
476  for (const SDep &Succ : Nodes[i]->Succs)
477  if (Nodes.count(Succ.getSUnit()))
478  Latency += Succ.getLatency();
479  }
480 
481  bool insert(SUnit *SU) { return Nodes.insert(SU); }
482 
483  void insert(iterator S, iterator E) { Nodes.insert(S, E); }
484 
485  template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
486  return Nodes.remove_if(P);
487  }
488 
489  unsigned count(SUnit *SU) const { return Nodes.count(SU); }
490 
491  bool hasRecurrence() { return HasRecurrence; };
492 
493  unsigned size() const { return Nodes.size(); }
494 
495  bool empty() const { return Nodes.empty(); }
496 
497  SUnit *getNode(unsigned i) const { return Nodes[i]; };
498 
499  void setRecMII(unsigned mii) { RecMII = mii; };
500 
501  void setColocate(unsigned c) { Colocate = c; };
502 
503  void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
504 
505  bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
506 
507  int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
508 
509  int getRecMII() { return RecMII; }
510 
511  /// Summarize node functions for the entire node set.
512  void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
513  for (SUnit *SU : *this) {
514  MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
515  MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
516  }
517  }
518 
519  unsigned getLatency() { return Latency; }
520 
521  unsigned getMaxDepth() { return MaxDepth; }
522 
523  void clear() {
524  Nodes.clear();
525  RecMII = 0;
526  HasRecurrence = false;
527  MaxMOV = 0;
528  MaxDepth = 0;
529  Colocate = 0;
530  ExceedPressure = nullptr;
531  }
532 
533  operator SetVector<SUnit *> &() { return Nodes; }
534 
535  /// Sort the node sets by importance. First, rank them by recurrence MII,
536  /// then by mobility (least mobile done first), and finally by depth.
537  /// Each node set may contain a colocate value which is used as the first
538  /// tie breaker, if it's set.
539  bool operator>(const NodeSet &RHS) const {
540  if (RecMII == RHS.RecMII) {
541  if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
542  return Colocate < RHS.Colocate;
543  if (MaxMOV == RHS.MaxMOV)
544  return MaxDepth > RHS.MaxDepth;
545  return MaxMOV < RHS.MaxMOV;
546  }
547  return RecMII > RHS.RecMII;
548  }
549 
550  bool operator==(const NodeSet &RHS) const {
551  return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
552  MaxDepth == RHS.MaxDepth;
553  }
554 
555  bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
556 
557  iterator begin() { return Nodes.begin(); }
558  iterator end() { return Nodes.end(); }
559 
560  void print(raw_ostream &os) const {
561  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
562  << " depth " << MaxDepth << " col " << Colocate << "\n";
563  for (const auto &I : Nodes)
564  os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
565  os << "\n";
566  }
567 
568 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
569  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
570 #endif
571 };
572 
573 /// This class represents the scheduled code. The main data structure is a
574 /// map from scheduled cycle to instructions. During scheduling, the
575 /// data structure explicitly represents all stages/iterations. When
576 /// the algorithm finshes, the schedule is collapsed into a single stage,
577 /// which represents instructions from different loop iterations.
578 ///
579 /// The SMS algorithm allows negative values for cycles, so the first cycle
580 /// in the schedule is the smallest cycle value.
581 class SMSchedule {
582 private:
583  /// Map from execution cycle to instructions.
584  DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
585 
586  /// Map from instruction to execution cycle.
587  std::map<SUnit *, int> InstrToCycle;
588 
589  /// Map for each register and the max difference between its uses and def.
590  /// The first element in the pair is the max difference in stages. The
591  /// second is true if the register defines a Phi value and loop value is
592  /// scheduled before the Phi.
593  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
594 
595  /// Keep track of the first cycle value in the schedule. It starts
596  /// as zero, but the algorithm allows negative values.
597  int FirstCycle = 0;
598 
599  /// Keep track of the last cycle value in the schedule.
600  int LastCycle = 0;
601 
602  /// The initiation interval (II) for the schedule.
603  int InitiationInterval = 0;
604 
605  /// Target machine information.
606  const TargetSubtargetInfo &ST;
607 
608  /// Virtual register information.
610 
611  std::unique_ptr<DFAPacketizer> Resources;
612 
613 public:
614  SMSchedule(MachineFunction *mf)
615  : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
616  Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
617 
618  void reset() {
619  ScheduledInstrs.clear();
620  InstrToCycle.clear();
621  RegToStageDiff.clear();
622  FirstCycle = 0;
623  LastCycle = 0;
624  InitiationInterval = 0;
625  }
626 
627  /// Set the initiation interval for this schedule.
628  void setInitiationInterval(int ii) { InitiationInterval = ii; }
629 
630  /// Return the first cycle in the completed schedule. This
631  /// can be a negative value.
632  int getFirstCycle() const { return FirstCycle; }
633 
634  /// Return the last cycle in the finalized schedule.
635  int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
636 
637  /// Return the cycle of the earliest scheduled instruction in the dependence
638  /// chain.
639  int earliestCycleInChain(const SDep &Dep);
640 
641  /// Return the cycle of the latest scheduled instruction in the dependence
642  /// chain.
643  int latestCycleInChain(const SDep &Dep);
644 
645  void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
646  int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
647  bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
648 
649  /// Iterators for the cycle to instruction map.
650  using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
651  using const_sched_iterator =
652  DenseMap<int, std::deque<SUnit *>>::const_iterator;
653 
654  /// Return true if the instruction is scheduled at the specified stage.
655  bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
656  return (stageScheduled(SU) == (int)StageNum);
657  }
658 
659  /// Return the stage for a scheduled instruction. Return -1 if
660  /// the instruction has not been scheduled.
661  int stageScheduled(SUnit *SU) const {
662  std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
663  if (it == InstrToCycle.end())
664  return -1;
665  return (it->second - FirstCycle) / InitiationInterval;
666  }
667 
668  /// Return the cycle for a scheduled instruction. This function normalizes
669  /// the first cycle to be 0.
670  unsigned cycleScheduled(SUnit *SU) const {
671  std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
672  assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
673  return (it->second - FirstCycle) % InitiationInterval;
674  }
675 
676  /// Return the maximum stage count needed for this schedule.
677  unsigned getMaxStageCount() {
678  return (LastCycle - FirstCycle) / InitiationInterval;
679  }
680 
681  /// Return the max. number of stages/iterations that can occur between a
682  /// register definition and its uses.
683  unsigned getStagesForReg(int Reg, unsigned CurStage) {
684  std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
685  if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
686  return 1;
687  return Stages.first;
688  }
689 
690  /// The number of stages for a Phi is a little different than other
691  /// instructions. The minimum value computed in RegToStageDiff is 1
692  /// because we assume the Phi is needed for at least 1 iteration.
693  /// This is not the case if the loop value is scheduled prior to the
694  /// Phi in the same stage. This function returns the number of stages
695  /// or iterations needed between the Phi definition and any uses.
696  unsigned getStagesForPhi(int Reg) {
697  std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
698  if (Stages.second)
699  return Stages.first;
700  return Stages.first - 1;
701  }
702 
703  /// Return the instructions that are scheduled at the specified cycle.
704  std::deque<SUnit *> &getInstructions(int cycle) {
705  return ScheduledInstrs[cycle];
706  }
707 
708  bool isValidSchedule(SwingSchedulerDAG *SSD);
709  void finalizeSchedule(SwingSchedulerDAG *SSD);
710  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
711  std::deque<SUnit *> &Insts);
712  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
713  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
714  MachineOperand &MO);
715  void print(raw_ostream &os) const;
716  void dump() const;
717 };
718 
719 } // end anonymous namespace
720 
721 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
722 char MachinePipeliner::ID = 0;
723 #ifndef NDEBUG
724 int MachinePipeliner::NumTries = 0;
725 #endif
727 
728 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
729  "Modulo Software Pipelining", false, false)
734 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
735  "Modulo Software Pipelining", false, false)
736 
737 /// The "main" function for implementing Swing Modulo Scheduling.
738 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
739  if (skipFunction(mf.getFunction()))
740  return false;
741 
742  if (!EnableSWP)
743  return false;
744 
745  if (mf.getFunction().getAttributes().hasAttribute(
746  AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
747  !EnableSWPOptSize.getPosition())
748  return false;
749 
750  MF = &mf;
751  MLI = &getAnalysis<MachineLoopInfo>();
752  MDT = &getAnalysis<MachineDominatorTree>();
753  TII = MF->getSubtarget().getInstrInfo();
754  RegClassInfo.runOnMachineFunction(*MF);
755 
756  for (auto &L : *MLI)
757  scheduleLoop(*L);
758 
759  return false;
760 }
761 
762 /// Attempt to perform the SMS algorithm on the specified loop. This function is
763 /// the main entry point for the algorithm. The function identifies candidate
764 /// loops, calculates the minimum initiation interval, and attempts to schedule
765 /// the loop.
766 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
767  bool Changed = false;
768  for (auto &InnerLoop : L)
769  Changed |= scheduleLoop(*InnerLoop);
770 
771 #ifndef NDEBUG
772  // Stop trying after reaching the limit (if any).
773  int Limit = SwpLoopLimit;
774  if (Limit >= 0) {
775  if (NumTries >= SwpLoopLimit)
776  return Changed;
777  NumTries++;
778  }
779 #endif
780 
781  if (!canPipelineLoop(L))
782  return Changed;
783 
784  ++NumTrytoPipeline;
785 
786  Changed = swingModuloScheduler(L);
787 
788  return Changed;
789 }
790 
791 /// Return true if the loop can be software pipelined. The algorithm is
792 /// restricted to loops with a single basic block. Make sure that the
793 /// branch in the loop can be analyzed.
794 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
795  if (L.getNumBlocks() != 1)
796  return false;
797 
798  // Check if the branch can't be understood because we can't do pipelining
799  // if that's the case.
800  LI.TBB = nullptr;
801  LI.FBB = nullptr;
802  LI.BrCond.clear();
803  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
804  return false;
805 
806  LI.LoopInductionVar = nullptr;
807  LI.LoopCompare = nullptr;
808  if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
809  return false;
810 
811  if (!L.getLoopPreheader())
812  return false;
813 
814  // Remove any subregisters from inputs to phi nodes.
815  preprocessPhiNodes(*L.getHeader());
816  return true;
817 }
818 
819 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
820  MachineRegisterInfo &MRI = MF->getRegInfo();
821  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
822 
823  for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
824  MachineOperand &DefOp = PI.getOperand(0);
825  assert(DefOp.getSubReg() == 0);
826  auto *RC = MRI.getRegClass(DefOp.getReg());
827 
828  for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
829  MachineOperand &RegOp = PI.getOperand(i);
830  if (RegOp.getSubReg() == 0)
831  continue;
832 
833  // If the operand uses a subregister, replace it with a new register
834  // without subregisters, and generate a copy to the new register.
835  unsigned NewReg = MRI.createVirtualRegister(RC);
836  MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
838  const DebugLoc &DL = PredB.findDebugLoc(At);
839  auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
840  .addReg(RegOp.getReg(), getRegState(RegOp),
841  RegOp.getSubReg());
842  Slots.insertMachineInstrInMaps(*Copy);
843  RegOp.setReg(NewReg);
844  RegOp.setSubReg(0);
845  }
846  }
847 }
848 
849 /// The SMS algorithm consists of the following main steps:
850 /// 1. Computation and analysis of the dependence graph.
851 /// 2. Ordering of the nodes (instructions).
852 /// 3. Attempt to Schedule the loop.
853 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
854  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
855 
856  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
857 
858  MachineBasicBlock *MBB = L.getHeader();
859  // The kernel should not include any terminator instructions. These
860  // will be added back later.
861  SMS.startBlock(MBB);
862 
863  // Compute the number of 'real' instructions in the basic block by
864  // ignoring terminators.
865  unsigned size = MBB->size();
867  E = MBB->instr_end();
868  I != E; ++I, --size)
869  ;
870 
871  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
872  SMS.schedule();
873  SMS.exitRegion();
874 
875  SMS.finishBlock();
876  return SMS.hasNewSchedule();
877 }
878 
879 /// We override the schedule function in ScheduleDAGInstrs to implement the
880 /// scheduling part of the Swing Modulo Scheduling algorithm.
881 void SwingSchedulerDAG::schedule() {
882  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
883  buildSchedGraph(AA);
884  addLoopCarriedDependences(AA);
885  updatePhiDependences();
887  postprocessDAG();
888  changeDependences();
889  LLVM_DEBUG(dump());
890 
891  NodeSetType NodeSets;
892  findCircuits(NodeSets);
893  NodeSetType Circuits = NodeSets;
894 
895  // Calculate the MII.
896  unsigned ResMII = calculateResMII();
897  unsigned RecMII = calculateRecMII(NodeSets);
898 
899  fuseRecs(NodeSets);
900 
901  // This flag is used for testing and can cause correctness problems.
902  if (SwpIgnoreRecMII)
903  RecMII = 0;
904 
905  MII = std::max(ResMII, RecMII);
906  LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII
907  << ", res=" << ResMII << ")\n");
908 
909  // Can't schedule a loop without a valid MII.
910  if (MII == 0)
911  return;
912 
913  // Don't pipeline large loops.
914  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
915  return;
916 
917  computeNodeFunctions(NodeSets);
918 
919  registerPressureFilter(NodeSets);
920 
921  colocateNodeSets(NodeSets);
922 
923  checkNodeSets(NodeSets);
924 
925  LLVM_DEBUG({
926  for (auto &I : NodeSets) {
927  dbgs() << " Rec NodeSet ";
928  I.dump();
929  }
930  });
931 
932  std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
933 
934  groupRemainingNodes(NodeSets);
935 
936  removeDuplicateNodes(NodeSets);
937 
938  LLVM_DEBUG({
939  for (auto &I : NodeSets) {
940  dbgs() << " NodeSet ";
941  I.dump();
942  }
943  });
944 
945  computeNodeOrder(NodeSets);
946 
947  // check for node order issues
948  checkValidNodeOrder(Circuits);
949 
950  SMSchedule Schedule(Pass.MF);
951  Scheduled = schedulePipeline(Schedule);
952 
953  if (!Scheduled)
954  return;
955 
956  unsigned numStages = Schedule.getMaxStageCount();
957  // No need to generate pipeline if there are no overlapped iterations.
958  if (numStages == 0)
959  return;
960 
961  // Check that the maximum stage count is less than user-defined limit.
962  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
963  return;
964 
965  generatePipelinedLoop(Schedule);
966  ++NumPipelined;
967 }
968 
969 /// Clean up after the software pipeliner runs.
970 void SwingSchedulerDAG::finishBlock() {
971  for (MachineInstr *I : NewMIs)
972  MF.DeleteMachineInstr(I);
973  NewMIs.clear();
974 
975  // Call the superclass.
977 }
978 
979 /// Return the register values for the operands of a Phi instruction.
980 /// This function assume the instruction is a Phi.
981 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
982  unsigned &InitVal, unsigned &LoopVal) {
983  assert(Phi.isPHI() && "Expecting a Phi.");
984 
985  InitVal = 0;
986  LoopVal = 0;
987  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
988  if (Phi.getOperand(i + 1).getMBB() != Loop)
989  InitVal = Phi.getOperand(i).getReg();
990  else
991  LoopVal = Phi.getOperand(i).getReg();
992 
993  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
994 }
995 
996 /// Return the Phi register value that comes from the incoming block.
997 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
998  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
999  if (Phi.getOperand(i + 1).getMBB() != LoopBB)
1000  return Phi.getOperand(i).getReg();
1001  return 0;
1002 }
1003 
1004 /// Return the Phi register value that comes the loop block.
1005 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
1006  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
1007  if (Phi.getOperand(i + 1).getMBB() == LoopBB)
1008  return Phi.getOperand(i).getReg();
1009  return 0;
1010 }
1011 
1012 /// Return true if SUb can be reached from SUa following the chain edges.
1013 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
1014  SmallPtrSet<SUnit *, 8> Visited;
1015  SmallVector<SUnit *, 8> Worklist;
1016  Worklist.push_back(SUa);
1017  while (!Worklist.empty()) {
1018  const SUnit *SU = Worklist.pop_back_val();
1019  for (auto &SI : SU->Succs) {
1020  SUnit *SuccSU = SI.getSUnit();
1021  if (SI.getKind() == SDep::Order) {
1022  if (Visited.count(SuccSU))
1023  continue;
1024  if (SuccSU == SUb)
1025  return true;
1026  Worklist.push_back(SuccSU);
1027  Visited.insert(SuccSU);
1028  }
1029  }
1030  }
1031  return false;
1032 }
1033 
1034 /// Return true if the instruction causes a chain between memory
1035 /// references before and after it.
1037  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
1038  (MI.hasOrderedMemoryRef() &&
1039  (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
1040 }
1041 
1042 /// Return the underlying objects for the memory references of an instruction.
1043 /// This function calls the code in ValueTracking, but first checks that the
1044 /// instruction has a memory operand.
1047  const DataLayout &DL) {
1048  if (!MI->hasOneMemOperand())
1049  return;
1050  MachineMemOperand *MM = *MI->memoperands_begin();
1051  if (!MM->getValue())
1052  return;
1053  GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
1054  for (Value *V : Objs) {
1055  if (!isIdentifiedObject(V)) {
1056  Objs.clear();
1057  return;
1058  }
1059  Objs.push_back(V);
1060  }
1061 }
1062 
1063 /// Add a chain edge between a load and store if the store can be an
1064 /// alias of the load on a subsequent iteration, i.e., a loop carried
1065 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
1066 /// but that code doesn't create loop carried dependences.
1067 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
1069  Value *UnknownValue =
1071  for (auto &SU : SUnits) {
1072  MachineInstr &MI = *SU.getInstr();
1073  if (isDependenceBarrier(MI, AA))
1074  PendingLoads.clear();
1075  else if (MI.mayLoad()) {
1077  getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1078  if (Objs.empty())
1079  Objs.push_back(UnknownValue);
1080  for (auto V : Objs) {
1081  SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
1082  SUs.push_back(&SU);
1083  }
1084  } else if (MI.mayStore()) {
1086  getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1087  if (Objs.empty())
1088  Objs.push_back(UnknownValue);
1089  for (auto V : Objs) {
1091  PendingLoads.find(V);
1092  if (I == PendingLoads.end())
1093  continue;
1094  for (auto Load : I->second) {
1095  if (isSuccOrder(Load, &SU))
1096  continue;
1097  MachineInstr &LdMI = *Load->getInstr();
1098  // First, perform the cheaper check that compares the base register.
1099  // If they are the same and the load offset is less than the store
1100  // offset, then mark the dependence as loop carried potentially.
1101  unsigned BaseReg1, BaseReg2;
1102  int64_t Offset1, Offset2;
1103  if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
1104  TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
1105  if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
1106  assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1107  "What happened to the chain edge?");
1108  SDep Dep(Load, SDep::Barrier);
1109  Dep.setLatency(1);
1110  SU.addPred(Dep);
1111  continue;
1112  }
1113  }
1114  // Second, the more expensive check that uses alias analysis on the
1115  // base registers. If they alias, and the load offset is less than
1116  // the store offset, the mark the dependence as loop carried.
1117  if (!AA) {
1118  SDep Dep(Load, SDep::Barrier);
1119  Dep.setLatency(1);
1120  SU.addPred(Dep);
1121  continue;
1122  }
1123  MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1124  MachineMemOperand *MMO2 = *MI.memoperands_begin();
1125  if (!MMO1->getValue() || !MMO2->getValue()) {
1126  SDep Dep(Load, SDep::Barrier);
1127  Dep.setLatency(1);
1128  SU.addPred(Dep);
1129  continue;
1130  }
1131  if (MMO1->getValue() == MMO2->getValue() &&
1132  MMO1->getOffset() <= MMO2->getOffset()) {
1133  SDep Dep(Load, SDep::Barrier);
1134  Dep.setLatency(1);
1135  SU.addPred(Dep);
1136  continue;
1137  }
1138  AliasResult AAResult = AA->alias(
1140  MMO1->getAAInfo()),
1142  MMO2->getAAInfo()));
1143 
1144  if (AAResult != NoAlias) {
1145  SDep Dep(Load, SDep::Barrier);
1146  Dep.setLatency(1);
1147  SU.addPred(Dep);
1148  }
1149  }
1150  }
1151  }
1152  }
1153 }
1154 
1155 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1156 /// processes dependences for PHIs. This function adds true dependences
1157 /// from a PHI to a use, and a loop carried dependence from the use to the
1158 /// PHI. The loop carried dependence is represented as an anti dependence
1159 /// edge. This function also removes chain dependences between unrelated
1160 /// PHIs.
1161 void SwingSchedulerDAG::updatePhiDependences() {
1162  SmallVector<SDep, 4> RemoveDeps;
1164 
1165  // Iterate over each DAG node.
1166  for (SUnit &I : SUnits) {
1167  RemoveDeps.clear();
1168  // Set to true if the instruction has an operand defined by a Phi.
1169  unsigned HasPhiUse = 0;
1170  unsigned HasPhiDef = 0;
1171  MachineInstr *MI = I.getInstr();
1172  // Iterate over each operand, and we process the definitions.
1173  for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1174  MOE = MI->operands_end();
1175  MOI != MOE; ++MOI) {
1176  if (!MOI->isReg())
1177  continue;
1178  unsigned Reg = MOI->getReg();
1179  if (MOI->isDef()) {
1180  // If the register is used by a Phi, then create an anti dependence.
1182  UI = MRI.use_instr_begin(Reg),
1183  UE = MRI.use_instr_end();
1184  UI != UE; ++UI) {
1185  MachineInstr *UseMI = &*UI;
1186  SUnit *SU = getSUnit(UseMI);
1187  if (SU != nullptr && UseMI->isPHI()) {
1188  if (!MI->isPHI()) {
1189  SDep Dep(SU, SDep::Anti, Reg);
1190  Dep.setLatency(1);
1191  I.addPred(Dep);
1192  } else {
1193  HasPhiDef = Reg;
1194  // Add a chain edge to a dependent Phi that isn't an existing
1195  // predecessor.
1196  if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1197  I.addPred(SDep(SU, SDep::Barrier));
1198  }
1199  }
1200  }
1201  } else if (MOI->isUse()) {
1202  // If the register is defined by a Phi, then create a true dependence.
1203  MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1204  if (DefMI == nullptr)
1205  continue;
1206  SUnit *SU = getSUnit(DefMI);
1207  if (SU != nullptr && DefMI->isPHI()) {
1208  if (!MI->isPHI()) {
1209  SDep Dep(SU, SDep::Data, Reg);
1210  Dep.setLatency(0);
1211  ST.adjustSchedDependency(SU, &I, Dep);
1212  I.addPred(Dep);
1213  } else {
1214  HasPhiUse = Reg;
1215  // Add a chain edge to a dependent Phi that isn't an existing
1216  // predecessor.
1217  if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1218  I.addPred(SDep(SU, SDep::Barrier));
1219  }
1220  }
1221  }
1222  }
1223  // Remove order dependences from an unrelated Phi.
1224  if (!SwpPruneDeps)
1225  continue;
1226  for (auto &PI : I.Preds) {
1227  MachineInstr *PMI = PI.getSUnit()->getInstr();
1228  if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1229  if (I.getInstr()->isPHI()) {
1230  if (PMI->getOperand(0).getReg() == HasPhiUse)
1231  continue;
1232  if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1233  continue;
1234  }
1235  RemoveDeps.push_back(PI);
1236  }
1237  }
1238  for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1239  I.removePred(RemoveDeps[i]);
1240  }
1241 }
1242 
1243 /// Iterate over each DAG node and see if we can change any dependences
1244 /// in order to reduce the recurrence MII.
1245 void SwingSchedulerDAG::changeDependences() {
1246  // See if an instruction can use a value from the previous iteration.
1247  // If so, we update the base and offset of the instruction and change
1248  // the dependences.
1249  for (SUnit &I : SUnits) {
1250  unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1251  int64_t NewOffset = 0;
1252  if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1253  NewOffset))
1254  continue;
1255 
1256  // Get the MI and SUnit for the instruction that defines the original base.
1257  unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1258  MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1259  if (!DefMI)
1260  continue;
1261  SUnit *DefSU = getSUnit(DefMI);
1262  if (!DefSU)
1263  continue;
1264  // Get the MI and SUnit for the instruction that defins the new base.
1265  MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1266  if (!LastMI)
1267  continue;
1268  SUnit *LastSU = getSUnit(LastMI);
1269  if (!LastSU)
1270  continue;
1271 
1272  if (Topo.IsReachable(&I, LastSU))
1273  continue;
1274 
1275  // Remove the dependence. The value now depends on a prior iteration.
1276  SmallVector<SDep, 4> Deps;
1277  for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1278  ++P)
1279  if (P->getSUnit() == DefSU)
1280  Deps.push_back(*P);
1281  for (int i = 0, e = Deps.size(); i != e; i++) {
1282  Topo.RemovePred(&I, Deps[i].getSUnit());
1283  I.removePred(Deps[i]);
1284  }
1285  // Remove the chain dependence between the instructions.
1286  Deps.clear();
1287  for (auto &P : LastSU->Preds)
1288  if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1289  Deps.push_back(P);
1290  for (int i = 0, e = Deps.size(); i != e; i++) {
1291  Topo.RemovePred(LastSU, Deps[i].getSUnit());
1292  LastSU->removePred(Deps[i]);
1293  }
1294 
1295  // Add a dependence between the new instruction and the instruction
1296  // that defines the new base.
1297  SDep Dep(&I, SDep::Anti, NewBase);
1298  LastSU->addPred(Dep);
1299 
1300  // Remember the base and offset information so that we can update the
1301  // instruction during code generation.
1302  InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1303  }
1304 }
1305 
1306 namespace {
1307 
1308 // FuncUnitSorter - Comparison operator used to sort instructions by
1309 // the number of functional unit choices.
1310 struct FuncUnitSorter {
1311  const InstrItineraryData *InstrItins;
1312  DenseMap<unsigned, unsigned> Resources;
1313 
1314  FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1315 
1316  // Compute the number of functional unit alternatives needed
1317  // at each stage, and take the minimum value. We prioritize the
1318  // instructions by the least number of choices first.
1319  unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1320  unsigned schedClass = Inst->getDesc().getSchedClass();
1321  unsigned min = UINT_MAX;
1322  for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1323  *IE = InstrItins->endStage(schedClass);
1324  IS != IE; ++IS) {
1325  unsigned funcUnits = IS->getUnits();
1326  unsigned numAlternatives = countPopulation(funcUnits);
1327  if (numAlternatives < min) {
1328  min = numAlternatives;
1329  F = funcUnits;
1330  }
1331  }
1332  return min;
1333  }
1334 
1335  // Compute the critical resources needed by the instruction. This
1336  // function records the functional units needed by instructions that
1337  // must use only one functional unit. We use this as a tie breaker
1338  // for computing the resource MII. The instrutions that require
1339  // the same, highly used, functional unit have high priority.
1340  void calcCriticalResources(MachineInstr &MI) {
1341  unsigned SchedClass = MI.getDesc().getSchedClass();
1342  for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1343  *IE = InstrItins->endStage(SchedClass);
1344  IS != IE; ++IS) {
1345  unsigned FuncUnits = IS->getUnits();
1346  if (countPopulation(FuncUnits) == 1)
1347  Resources[FuncUnits]++;
1348  }
1349  }
1350 
1351  /// Return true if IS1 has less priority than IS2.
1352  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1353  unsigned F1 = 0, F2 = 0;
1354  unsigned MFUs1 = minFuncUnits(IS1, F1);
1355  unsigned MFUs2 = minFuncUnits(IS2, F2);
1356  if (MFUs1 == 1 && MFUs2 == 1)
1357  return Resources.lookup(F1) < Resources.lookup(F2);
1358  return MFUs1 > MFUs2;
1359  }
1360 };
1361 
1362 } // end anonymous namespace
1363 
1364 /// Calculate the resource constrained minimum initiation interval for the
1365 /// specified loop. We use the DFA to model the resources needed for
1366 /// each instruction, and we ignore dependences. A different DFA is created
1367 /// for each cycle that is required. When adding a new instruction, we attempt
1368 /// to add it to each existing DFA, until a legal space is found. If the
1369 /// instruction cannot be reserved in an existing DFA, we create a new one.
1370 unsigned SwingSchedulerDAG::calculateResMII() {
1372  MachineBasicBlock *MBB = Loop.getHeader();
1373  Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1374 
1375  // Sort the instructions by the number of available choices for scheduling,
1376  // least to most. Use the number of critical resources as the tie breaker.
1377  FuncUnitSorter FUS =
1378  FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1380  E = MBB->getFirstTerminator();
1381  I != E; ++I)
1382  FUS.calcCriticalResources(*I);
1384  FuncUnitOrder(FUS);
1385 
1387  E = MBB->getFirstTerminator();
1388  I != E; ++I)
1389  FuncUnitOrder.push(&*I);
1390 
1391  while (!FuncUnitOrder.empty()) {
1392  MachineInstr *MI = FuncUnitOrder.top();
1393  FuncUnitOrder.pop();
1394  if (TII->isZeroCost(MI->getOpcode()))
1395  continue;
1396  // Attempt to reserve the instruction in an existing DFA. At least one
1397  // DFA is needed for each cycle.
1398  unsigned NumCycles = getSUnit(MI)->Latency;
1399  unsigned ReservedCycles = 0;
1402  for (unsigned C = 0; C < NumCycles; ++C)
1403  while (RI != RE) {
1404  if ((*RI++)->canReserveResources(*MI)) {
1405  ++ReservedCycles;
1406  break;
1407  }
1408  }
1409  // Start reserving resources using existing DFAs.
1410  for (unsigned C = 0; C < ReservedCycles; ++C) {
1411  --RI;
1412  (*RI)->reserveResources(*MI);
1413  }
1414  // Add new DFAs, if needed, to reserve resources.
1415  for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1416  DFAPacketizer *NewResource =
1418  assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1419  NewResource->reserveResources(*MI);
1420  Resources.push_back(NewResource);
1421  }
1422  }
1423  int Resmii = Resources.size();
1424  // Delete the memory for each of the DFAs that were created earlier.
1425  for (DFAPacketizer *RI : Resources) {
1426  DFAPacketizer *D = RI;
1427  delete D;
1428  }
1429  Resources.clear();
1430  return Resmii;
1431 }
1432 
1433 /// Calculate the recurrence-constrainted minimum initiation interval.
1434 /// Iterate over each circuit. Compute the delay(c) and distance(c)
1435 /// for each circuit. The II needs to satisfy the inequality
1436 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1437 /// II that satisfies the inequality, and the RecMII is the maximum
1438 /// of those values.
1439 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1440  unsigned RecMII = 0;
1441 
1442  for (NodeSet &Nodes : NodeSets) {
1443  if (Nodes.empty())
1444  continue;
1445 
1446  unsigned Delay = Nodes.getLatency();
1447  unsigned Distance = 1;
1448 
1449  // ii = ceil(delay / distance)
1450  unsigned CurMII = (Delay + Distance - 1) / Distance;
1451  Nodes.setRecMII(CurMII);
1452  if (CurMII > RecMII)
1453  RecMII = CurMII;
1454  }
1455 
1456  return RecMII;
1457 }
1458 
1459 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1460 /// but we do this to find the circuits, and then change them back.
1461 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1463  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1464  SUnit *SU = &SUnits[i];
1465  for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1466  IP != EP; ++IP) {
1467  if (IP->getKind() != SDep::Anti)
1468  continue;
1469  DepsAdded.push_back(std::make_pair(SU, *IP));
1470  }
1471  }
1472  for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1473  E = DepsAdded.end();
1474  I != E; ++I) {
1475  // Remove this anti dependency and add one in the reverse direction.
1476  SUnit *SU = I->first;
1477  SDep &D = I->second;
1478  SUnit *TargetSU = D.getSUnit();
1479  unsigned Reg = D.getReg();
1480  unsigned Lat = D.getLatency();
1481  SU->removePred(D);
1482  SDep Dep(SU, SDep::Anti, Reg);
1483  Dep.setLatency(Lat);
1484  TargetSU->addPred(Dep);
1485  }
1486 }
1487 
1488 /// Create the adjacency structure of the nodes in the graph.
1489 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1490  SwingSchedulerDAG *DAG) {
1491  BitVector Added(SUnits.size());
1492  DenseMap<int, int> OutputDeps;
1493  for (int i = 0, e = SUnits.size(); i != e; ++i) {
1494  Added.reset();
1495  // Add any successor to the adjacency matrix and exclude duplicates.
1496  for (auto &SI : SUnits[i].Succs) {
1497  // Only create a back-edge on the first and last nodes of a dependence
1498  // chain. This records any chains and adds them later.
1499  if (SI.getKind() == SDep::Output) {
1500  int N = SI.getSUnit()->NodeNum;
1501  int BackEdge = i;
1502  auto Dep = OutputDeps.find(BackEdge);
1503  if (Dep != OutputDeps.end()) {
1504  BackEdge = Dep->second;
1505  OutputDeps.erase(Dep);
1506  }
1507  OutputDeps[N] = BackEdge;
1508  }
1509  // Do not process a boundary node and a back-edge is processed only
1510  // if it goes to a Phi.
1511  if (SI.getSUnit()->isBoundaryNode() ||
1512  (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1513  continue;
1514  int N = SI.getSUnit()->NodeNum;
1515  if (!Added.test(N)) {
1516  AdjK[i].push_back(N);
1517  Added.set(N);
1518  }
1519  }
1520  // A chain edge between a store and a load is treated as a back-edge in the
1521  // adjacency matrix.
1522  for (auto &PI : SUnits[i].Preds) {
1523  if (!SUnits[i].getInstr()->mayStore() ||
1524  !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1525  continue;
1526  if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1527  int N = PI.getSUnit()->NodeNum;
1528  if (!Added.test(N)) {
1529  AdjK[i].push_back(N);
1530  Added.set(N);
1531  }
1532  }
1533  }
1534  }
1535  // Add back-eges in the adjacency matrix for the output dependences.
1536  for (auto &OD : OutputDeps)
1537  if (!Added.test(OD.second)) {
1538  AdjK[OD.first].push_back(OD.second);
1539  Added.set(OD.second);
1540  }
1541 }
1542 
1543 /// Identify an elementary circuit in the dependence graph starting at the
1544 /// specified node.
1545 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1546  bool HasBackedge) {
1547  SUnit *SV = &SUnits[V];
1548  bool F = false;
1549  Stack.insert(SV);
1550  Blocked.set(V);
1551 
1552  for (auto W : AdjK[V]) {
1553  if (NumPaths > MaxPaths)
1554  break;
1555  if (W < S)
1556  continue;
1557  if (W == S) {
1558  if (!HasBackedge)
1559  NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1560  F = true;
1561  ++NumPaths;
1562  break;
1563  } else if (!Blocked.test(W)) {
1564  if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
1565  F = true;
1566  }
1567  }
1568 
1569  if (F)
1570  unblock(V);
1571  else {
1572  for (auto W : AdjK[V]) {
1573  if (W < S)
1574  continue;
1575  if (B[W].count(SV) == 0)
1576  B[W].insert(SV);
1577  }
1578  }
1579  Stack.pop_back();
1580  return F;
1581 }
1582 
1583 /// Unblock a node in the circuit finding algorithm.
1584 void SwingSchedulerDAG::Circuits::unblock(int U) {
1585  Blocked.reset(U);
1586  SmallPtrSet<SUnit *, 4> &BU = B[U];
1587  while (!BU.empty()) {
1589  assert(SI != BU.end() && "Invalid B set.");
1590  SUnit *W = *SI;
1591  BU.erase(W);
1592  if (Blocked.test(W->NodeNum))
1593  unblock(W->NodeNum);
1594  }
1595 }
1596 
1597 /// Identify all the elementary circuits in the dependence graph using
1598 /// Johnson's circuit algorithm.
1599 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1600  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1601  // but we do this to find the circuits, and then change them back.
1602  swapAntiDependences(SUnits);
1603 
1604  Circuits Cir(SUnits);
1605  // Create the adjacency structure.
1606  Cir.createAdjacencyStructure(this);
1607  for (int i = 0, e = SUnits.size(); i != e; ++i) {
1608  Cir.reset();
1609  Cir.circuit(i, i, NodeSets);
1610  }
1611 
1612  // Change the dependences back so that we've created a DAG again.
1613  swapAntiDependences(SUnits);
1614 }
1615 
1616 /// Return true for DAG nodes that we ignore when computing the cost functions.
1617 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1618 /// in the calculation of the ASAP, ALAP, etc functions.
1619 static bool ignoreDependence(const SDep &D, bool isPred) {
1620  if (D.isArtificial())
1621  return true;
1622  return D.getKind() == SDep::Anti && isPred;
1623 }
1624 
1625 /// Compute several functions need to order the nodes for scheduling.
1626 /// ASAP - Earliest time to schedule a node.
1627 /// ALAP - Latest time to schedule a node.
1628 /// MOV - Mobility function, difference between ALAP and ASAP.
1629 /// D - Depth of each node.
1630 /// H - Height of each node.
1631 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1632  ScheduleInfo.resize(SUnits.size());
1633 
1634  LLVM_DEBUG({
1636  E = Topo.end();
1637  I != E; ++I) {
1638  const SUnit &SU = SUnits[*I];
1639  dumpNode(SU);
1640  }
1641  });
1642 
1643  int maxASAP = 0;
1644  // Compute ASAP and ZeroLatencyDepth.
1646  E = Topo.end();
1647  I != E; ++I) {
1648  int asap = 0;
1649  int zeroLatencyDepth = 0;
1650  SUnit *SU = &SUnits[*I];
1651  for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1652  EP = SU->Preds.end();
1653  IP != EP; ++IP) {
1654  SUnit *pred = IP->getSUnit();
1655  if (IP->getLatency() == 0)
1656  zeroLatencyDepth =
1657  std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1658  if (ignoreDependence(*IP, true))
1659  continue;
1660  asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1661  getDistance(pred, SU, *IP) * MII));
1662  }
1663  maxASAP = std::max(maxASAP, asap);
1664  ScheduleInfo[*I].ASAP = asap;
1665  ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1666  }
1667 
1668  // Compute ALAP, ZeroLatencyHeight, and MOV.
1670  E = Topo.rend();
1671  I != E; ++I) {
1672  int alap = maxASAP;
1673  int zeroLatencyHeight = 0;
1674  SUnit *SU = &SUnits[*I];
1675  for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1676  ES = SU->Succs.end();
1677  IS != ES; ++IS) {
1678  SUnit *succ = IS->getSUnit();
1679  if (IS->getLatency() == 0)
1680  zeroLatencyHeight =
1681  std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1682  if (ignoreDependence(*IS, true))
1683  continue;
1684  alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1685  getDistance(SU, succ, *IS) * MII));
1686  }
1687 
1688  ScheduleInfo[*I].ALAP = alap;
1689  ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1690  }
1691 
1692  // After computing the node functions, compute the summary for each node set.
1693  for (NodeSet &I : NodeSets)
1694  I.computeNodeSetInfo(this);
1695 
1696  LLVM_DEBUG({
1697  for (unsigned i = 0; i < SUnits.size(); i++) {
1698  dbgs() << "\tNode " << i << ":\n";
1699  dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1700  dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1701  dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1702  dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1703  dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1704  dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1705  dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1706  }
1707  });
1708 }
1709 
1710 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1711 /// as the predecessors of the elements of NodeOrder that are not also in
1712 /// NodeOrder.
1713 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1715  const NodeSet *S = nullptr) {
1716  Preds.clear();
1717  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1718  I != E; ++I) {
1719  for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1720  PI != PE; ++PI) {
1721  if (S && S->count(PI->getSUnit()) == 0)
1722  continue;
1723  if (ignoreDependence(*PI, true))
1724  continue;
1725  if (NodeOrder.count(PI->getSUnit()) == 0)
1726  Preds.insert(PI->getSUnit());
1727  }
1728  // Back-edges are predecessors with an anti-dependence.
1729  for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1730  ES = (*I)->Succs.end();
1731  IS != ES; ++IS) {
1732  if (IS->getKind() != SDep::Anti)
1733  continue;
1734  if (S && S->count(IS->getSUnit()) == 0)
1735  continue;
1736  if (NodeOrder.count(IS->getSUnit()) == 0)
1737  Preds.insert(IS->getSUnit());
1738  }
1739  }
1740  return !Preds.empty();
1741 }
1742 
1743 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1744 /// as the successors of the elements of NodeOrder that are not also in
1745 /// NodeOrder.
1746 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1748  const NodeSet *S = nullptr) {
1749  Succs.clear();
1750  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1751  I != E; ++I) {
1752  for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1753  SI != SE; ++SI) {
1754  if (S && S->count(SI->getSUnit()) == 0)
1755  continue;
1756  if (ignoreDependence(*SI, false))
1757  continue;
1758  if (NodeOrder.count(SI->getSUnit()) == 0)
1759  Succs.insert(SI->getSUnit());
1760  }
1761  for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1762  PE = (*I)->Preds.end();
1763  PI != PE; ++PI) {
1764  if (PI->getKind() != SDep::Anti)
1765  continue;
1766  if (S && S->count(PI->getSUnit()) == 0)
1767  continue;
1768  if (NodeOrder.count(PI->getSUnit()) == 0)
1769  Succs.insert(PI->getSUnit());
1770  }
1771  }
1772  return !Succs.empty();
1773 }
1774 
1775 /// Return true if there is a path from the specified node to any of the nodes
1776 /// in DestNodes. Keep track and return the nodes in any path.
1777 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1778  SetVector<SUnit *> &DestNodes,
1779  SetVector<SUnit *> &Exclude,
1780  SmallPtrSet<SUnit *, 8> &Visited) {
1781  if (Cur->isBoundaryNode())
1782  return false;
1783  if (Exclude.count(Cur) != 0)
1784  return false;
1785  if (DestNodes.count(Cur) != 0)
1786  return true;
1787  if (!Visited.insert(Cur).second)
1788  return Path.count(Cur) != 0;
1789  bool FoundPath = false;
1790  for (auto &SI : Cur->Succs)
1791  FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1792  for (auto &PI : Cur->Preds)
1793  if (PI.getKind() == SDep::Anti)
1794  FoundPath |=
1795  computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1796  if (FoundPath)
1797  Path.insert(Cur);
1798  return FoundPath;
1799 }
1800 
1801 /// Return true if Set1 is a subset of Set2.
1802 template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1803  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1804  if (Set2.count(*I) == 0)
1805  return false;
1806  return true;
1807 }
1808 
1809 /// Compute the live-out registers for the instructions in a node-set.
1810 /// The live-out registers are those that are defined in the node-set,
1811 /// but not used. Except for use operands of Phis.
1813  NodeSet &NS) {
1815  MachineRegisterInfo &MRI = MF.getRegInfo();
1817  SmallSet<unsigned, 4> Uses;
1818  for (SUnit *SU : NS) {
1819  const MachineInstr *MI = SU->getInstr();
1820  if (MI->isPHI())
1821  continue;
1822  for (const MachineOperand &MO : MI->operands())
1823  if (MO.isReg() && MO.isUse()) {
1824  unsigned Reg = MO.getReg();
1826  Uses.insert(Reg);
1827  else if (MRI.isAllocatable(Reg))
1828  for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1829  Uses.insert(*Units);
1830  }
1831  }
1832  for (SUnit *SU : NS)
1833  for (const MachineOperand &MO : SU->getInstr()->operands())
1834  if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1835  unsigned Reg = MO.getReg();
1837  if (!Uses.count(Reg))
1838  LiveOutRegs.push_back(RegisterMaskPair(Reg,
1840  } else if (MRI.isAllocatable(Reg)) {
1841  for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1842  if (!Uses.count(*Units))
1843  LiveOutRegs.push_back(RegisterMaskPair(*Units,
1845  }
1846  }
1847  RPTracker.addLiveRegs(LiveOutRegs);
1848 }
1849 
1850 /// A heuristic to filter nodes in recurrent node-sets if the register
1851 /// pressure of a set is too high.
1852 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1853  for (auto &NS : NodeSets) {
1854  // Skip small node-sets since they won't cause register pressure problems.
1855  if (NS.size() <= 2)
1856  continue;
1857  IntervalPressure RecRegPressure;
1858  RegPressureTracker RecRPTracker(RecRegPressure);
1859  RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1860  computeLiveOuts(MF, RecRPTracker, NS);
1861  RecRPTracker.closeBottom();
1862 
1863  std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1864  llvm::sort(SUnits.begin(), SUnits.end(),
1865  [](const SUnit *A, const SUnit *B) {
1866  return A->NodeNum > B->NodeNum;
1867  });
1868 
1869  for (auto &SU : SUnits) {
1870  // Since we're computing the register pressure for a subset of the
1871  // instructions in a block, we need to set the tracker for each
1872  // instruction in the node-set. The tracker is set to the instruction
1873  // just after the one we're interested in.
1875  RecRPTracker.setPos(std::next(CurInstI));
1876 
1877  RegPressureDelta RPDelta;
1878  ArrayRef<PressureChange> CriticalPSets;
1879  RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1880  CriticalPSets,
1881  RecRegPressure.MaxSetPressure);
1882  if (RPDelta.Excess.isValid()) {
1883  LLVM_DEBUG(
1884  dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1885  << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1886  << ":" << RPDelta.Excess.getUnitInc());
1887  NS.setExceedPressure(SU);
1888  break;
1889  }
1890  RecRPTracker.recede();
1891  }
1892  }
1893 }
1894 
1895 /// A heuristic to colocate node sets that have the same set of
1896 /// successors.
1897 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1898  unsigned Colocate = 0;
1899  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1900  NodeSet &N1 = NodeSets[i];
1902  if (N1.empty() || !succ_L(N1, S1))
1903  continue;
1904  for (int j = i + 1; j < e; ++j) {
1905  NodeSet &N2 = NodeSets[j];
1906  if (N1.compareRecMII(N2) != 0)
1907  continue;
1909  if (N2.empty() || !succ_L(N2, S2))
1910  continue;
1911  if (isSubset(S1, S2) && S1.size() == S2.size()) {
1912  N1.setColocate(++Colocate);
1913  N2.setColocate(Colocate);
1914  break;
1915  }
1916  }
1917  }
1918 }
1919 
1920 /// Check if the existing node-sets are profitable. If not, then ignore the
1921 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1922 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
1923 /// then it's best to try to schedule all instructions together instead of
1924 /// starting with the recurrent node-sets.
1925 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1926  // Look for loops with a large MII.
1927  if (MII < 17)
1928  return;
1929  // Check if the node-set contains only a simple add recurrence.
1930  for (auto &NS : NodeSets) {
1931  if (NS.getRecMII() > 2)
1932  return;
1933  if (NS.getMaxDepth() > MII)
1934  return;
1935  }
1936  NodeSets.clear();
1937  LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1938  return;
1939 }
1940 
1941 /// Add the nodes that do not belong to a recurrence set into groups
1942 /// based upon connected componenets.
1943 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1944  SetVector<SUnit *> NodesAdded;
1945  SmallPtrSet<SUnit *, 8> Visited;
1946  // Add the nodes that are on a path between the previous node sets and
1947  // the current node set.
1948  for (NodeSet &I : NodeSets) {
1950  // Add the nodes from the current node set to the previous node set.
1951  if (succ_L(I, N)) {
1952  SetVector<SUnit *> Path;
1953  for (SUnit *NI : N) {
1954  Visited.clear();
1955  computePath(NI, Path, NodesAdded, I, Visited);
1956  }
1957  if (!Path.empty())
1958  I.insert(Path.begin(), Path.end());
1959  }
1960  // Add the nodes from the previous node set to the current node set.
1961  N.clear();
1962  if (succ_L(NodesAdded, N)) {
1963  SetVector<SUnit *> Path;
1964  for (SUnit *NI : N) {
1965  Visited.clear();
1966  computePath(NI, Path, I, NodesAdded, Visited);
1967  }
1968  if (!Path.empty())
1969  I.insert(Path.begin(), Path.end());
1970  }
1971  NodesAdded.insert(I.begin(), I.end());
1972  }
1973 
1974  // Create a new node set with the connected nodes of any successor of a node
1975  // in a recurrent set.
1976  NodeSet NewSet;
1978  if (succ_L(NodesAdded, N))
1979  for (SUnit *I : N)
1980  addConnectedNodes(I, NewSet, NodesAdded);
1981  if (!NewSet.empty())
1982  NodeSets.push_back(NewSet);
1983 
1984  // Create a new node set with the connected nodes of any predecessor of a node
1985  // in a recurrent set.
1986  NewSet.clear();
1987  if (pred_L(NodesAdded, N))
1988  for (SUnit *I : N)
1989  addConnectedNodes(I, NewSet, NodesAdded);
1990  if (!NewSet.empty())
1991  NodeSets.push_back(NewSet);
1992 
1993  // Create new nodes sets with the connected nodes any remaining node that
1994  // has no predecessor.
1995  for (unsigned i = 0; i < SUnits.size(); ++i) {
1996  SUnit *SU = &SUnits[i];
1997  if (NodesAdded.count(SU) == 0) {
1998  NewSet.clear();
1999  addConnectedNodes(SU, NewSet, NodesAdded);
2000  if (!NewSet.empty())
2001  NodeSets.push_back(NewSet);
2002  }
2003  }
2004 }
2005 
2006 /// Add the node to the set, and add all is its connected nodes to the set.
2007 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2008  SetVector<SUnit *> &NodesAdded) {
2009  NewSet.insert(SU);
2010  NodesAdded.insert(SU);
2011  for (auto &SI : SU->Succs) {
2012  SUnit *Successor = SI.getSUnit();
2013  if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
2014  addConnectedNodes(Successor, NewSet, NodesAdded);
2015  }
2016  for (auto &PI : SU->Preds) {
2017  SUnit *Predecessor = PI.getSUnit();
2018  if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2019  addConnectedNodes(Predecessor, NewSet, NodesAdded);
2020  }
2021 }
2022 
2023 /// Return true if Set1 contains elements in Set2. The elements in common
2024 /// are returned in a different container.
2025 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2026  SmallSetVector<SUnit *, 8> &Result) {
2027  Result.clear();
2028  for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
2029  SUnit *SU = Set1[i];
2030  if (Set2.count(SU) != 0)
2031  Result.insert(SU);
2032  }
2033  return !Result.empty();
2034 }
2035 
2036 /// Merge the recurrence node sets that have the same initial node.
2037 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2038  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2039  ++I) {
2040  NodeSet &NI = *I;
2041  for (NodeSetType::iterator J = I + 1; J != E;) {
2042  NodeSet &NJ = *J;
2043  if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2044  if (NJ.compareRecMII(NI) > 0)
2045  NI.setRecMII(NJ.getRecMII());
2046  for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
2047  ++NII)
2048  I->insert(*NII);
2049  NodeSets.erase(J);
2050  E = NodeSets.end();
2051  } else {
2052  ++J;
2053  }
2054  }
2055  }
2056 }
2057 
2058 /// Remove nodes that have been scheduled in previous NodeSets.
2059 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2060  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2061  ++I)
2062  for (NodeSetType::iterator J = I + 1; J != E;) {
2063  J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2064 
2065  if (J->empty()) {
2066  NodeSets.erase(J);
2067  E = NodeSets.end();
2068  } else {
2069  ++J;
2070  }
2071  }
2072 }
2073 
2074 /// Compute an ordered list of the dependence graph nodes, which
2075 /// indicates the order that the nodes will be scheduled. This is a
2076 /// two-level algorithm. First, a partial order is created, which
2077 /// consists of a list of sets ordered from highest to lowest priority.
2078 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2080  NodeOrder.clear();
2081 
2082  for (auto &Nodes : NodeSets) {
2083  LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2084  OrderKind Order;
2086  if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
2087  R.insert(N.begin(), N.end());
2088  Order = BottomUp;
2089  LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2090  } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
2091  R.insert(N.begin(), N.end());
2092  Order = TopDown;
2093  LLVM_DEBUG(dbgs() << " Top down (succs) ");
2094  } else if (isIntersect(N, Nodes, R)) {
2095  // If some of the successors are in the existing node-set, then use the
2096  // top-down ordering.
2097  Order = TopDown;
2098  LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2099  } else if (NodeSets.size() == 1) {
2100  for (auto &N : Nodes)
2101  if (N->Succs.size() == 0)
2102  R.insert(N);
2103  Order = BottomUp;
2104  LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2105  } else {
2106  // Find the node with the highest ASAP.
2107  SUnit *maxASAP = nullptr;
2108  for (SUnit *SU : Nodes) {
2109  if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2110  (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2111  maxASAP = SU;
2112  }
2113  R.insert(maxASAP);
2114  Order = BottomUp;
2115  LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2116  }
2117 
2118  while (!R.empty()) {
2119  if (Order == TopDown) {
2120  // Choose the node with the maximum height. If more than one, choose
2121  // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2122  // choose the node with the lowest MOV.
2123  while (!R.empty()) {
2124  SUnit *maxHeight = nullptr;
2125  for (SUnit *I : R) {
2126  if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2127  maxHeight = I;
2128  else if (getHeight(I) == getHeight(maxHeight) &&
2129  getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2130  maxHeight = I;
2131  else if (getHeight(I) == getHeight(maxHeight) &&
2132  getZeroLatencyHeight(I) ==
2133  getZeroLatencyHeight(maxHeight) &&
2134  getMOV(I) < getMOV(maxHeight))
2135  maxHeight = I;
2136  }
2137  NodeOrder.insert(maxHeight);
2138  LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2139  R.remove(maxHeight);
2140  for (const auto &I : maxHeight->Succs) {
2141  if (Nodes.count(I.getSUnit()) == 0)
2142  continue;
2143  if (NodeOrder.count(I.getSUnit()) != 0)
2144  continue;
2145  if (ignoreDependence(I, false))
2146  continue;
2147  R.insert(I.getSUnit());
2148  }
2149  // Back-edges are predecessors with an anti-dependence.
2150  for (const auto &I : maxHeight->Preds) {
2151  if (I.getKind() != SDep::Anti)
2152  continue;
2153  if (Nodes.count(I.getSUnit()) == 0)
2154  continue;
2155  if (NodeOrder.count(I.getSUnit()) != 0)
2156  continue;
2157  R.insert(I.getSUnit());
2158  }
2159  }
2160  Order = BottomUp;
2161  LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2163  if (pred_L(NodeOrder, N, &Nodes))
2164  R.insert(N.begin(), N.end());
2165  } else {
2166  // Choose the node with the maximum depth. If more than one, choose
2167  // the node with the maximum ZeroLatencyDepth. If still more than one,
2168  // choose the node with the lowest MOV.
2169  while (!R.empty()) {
2170  SUnit *maxDepth = nullptr;
2171  for (SUnit *I : R) {
2172  if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2173  maxDepth = I;
2174  else if (getDepth(I) == getDepth(maxDepth) &&
2175  getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2176  maxDepth = I;
2177  else if (getDepth(I) == getDepth(maxDepth) &&
2178  getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2179  getMOV(I) < getMOV(maxDepth))
2180  maxDepth = I;
2181  }
2182  NodeOrder.insert(maxDepth);
2183  LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2184  R.remove(maxDepth);
2185  if (Nodes.isExceedSU(maxDepth)) {
2186  Order = TopDown;
2187  R.clear();
2188  R.insert(Nodes.getNode(0));
2189  break;
2190  }
2191  for (const auto &I : maxDepth->Preds) {
2192  if (Nodes.count(I.getSUnit()) == 0)
2193  continue;
2194  if (NodeOrder.count(I.getSUnit()) != 0)
2195  continue;
2196  R.insert(I.getSUnit());
2197  }
2198  // Back-edges are predecessors with an anti-dependence.
2199  for (const auto &I : maxDepth->Succs) {
2200  if (I.getKind() != SDep::Anti)
2201  continue;
2202  if (Nodes.count(I.getSUnit()) == 0)
2203  continue;
2204  if (NodeOrder.count(I.getSUnit()) != 0)
2205  continue;
2206  R.insert(I.getSUnit());
2207  }
2208  }
2209  Order = TopDown;
2210  LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2212  if (succ_L(NodeOrder, N, &Nodes))
2213  R.insert(N.begin(), N.end());
2214  }
2215  }
2216  LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2217  }
2218 
2219  LLVM_DEBUG({
2220  dbgs() << "Node order: ";
2221  for (SUnit *I : NodeOrder)
2222  dbgs() << " " << I->NodeNum << " ";
2223  dbgs() << "\n";
2224  });
2225 }
2226 
2227 /// Process the nodes in the computed order and create the pipelined schedule
2228 /// of the instructions, if possible. Return true if a schedule is found.
2229 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2230  if (NodeOrder.empty())
2231  return false;
2232 
2233  bool scheduleFound = false;
2234  // Keep increasing II until a valid schedule is found.
2235  for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
2236  Schedule.reset();
2237  Schedule.setInitiationInterval(II);
2238  LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2239 
2240  SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2241  SetVector<SUnit *>::iterator NE = NodeOrder.end();
2242  do {
2243  SUnit *SU = *NI;
2244 
2245  // Compute the schedule time for the instruction, which is based
2246  // upon the scheduled time for any predecessors/successors.
2247  int EarlyStart = INT_MIN;
2248  int LateStart = INT_MAX;
2249  // These values are set when the size of the schedule window is limited
2250  // due to chain dependences.
2251  int SchedEnd = INT_MAX;
2252  int SchedStart = INT_MIN;
2253  Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2254  II, this);
2255  LLVM_DEBUG({
2256  dbgs() << "Inst (" << SU->NodeNum << ") ";
2257  SU->getInstr()->dump();
2258  dbgs() << "\n";
2259  });
2260  LLVM_DEBUG({
2261  dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2262  << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2263  });
2264 
2265  if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2266  SchedStart > LateStart)
2267  scheduleFound = false;
2268  else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2269  SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2270  scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2271  } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2272  SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2273  scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2274  } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2275  SchedEnd =
2276  std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2277  // When scheduling a Phi it is better to start at the late cycle and go
2278  // backwards. The default order may insert the Phi too far away from
2279  // its first dependence.
2280  if (SU->getInstr()->isPHI())
2281  scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2282  else
2283  scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2284  } else {
2285  int FirstCycle = Schedule.getFirstCycle();
2286  scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2287  FirstCycle + getASAP(SU) + II - 1, II);
2288  }
2289  // Even if we find a schedule, make sure the schedule doesn't exceed the
2290  // allowable number of stages. We keep trying if this happens.
2291  if (scheduleFound)
2292  if (SwpMaxStages > -1 &&
2293  Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2294  scheduleFound = false;
2295 
2296  LLVM_DEBUG({
2297  if (!scheduleFound)
2298  dbgs() << "\tCan't schedule\n";
2299  });
2300  } while (++NI != NE && scheduleFound);
2301 
2302  // If a schedule is found, check if it is a valid schedule too.
2303  if (scheduleFound)
2304  scheduleFound = Schedule.isValidSchedule(this);
2305  }
2306 
2307  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2308 
2309  if (scheduleFound)
2310  Schedule.finalizeSchedule(this);
2311  else
2312  Schedule.reset();
2313 
2314  return scheduleFound && Schedule.getMaxStageCount() > 0;
2315 }
2316 
2317 /// Given a schedule for the loop, generate a new version of the loop,
2318 /// and replace the old version. This function generates a prolog
2319 /// that contains the initial iterations in the pipeline, and kernel
2320 /// loop, and the epilogue that contains the code for the final
2321 /// iterations.
2322 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2323  // Create a new basic block for the kernel and add it to the CFG.
2324  MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2325 
2326  unsigned MaxStageCount = Schedule.getMaxStageCount();
2327 
2328  // Remember the registers that are used in different stages. The index is
2329  // the iteration, or stage, that the instruction is scheduled in. This is
2330  // a map between register names in the original block and the names created
2331  // in each stage of the pipelined loop.
2332  ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2333  InstrMapTy InstrMap;
2334 
2336  // Generate the prolog instructions that set up the pipeline.
2337  generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2338  MF.insert(BB->getIterator(), KernelBB);
2339 
2340  // Rearrange the instructions to generate the new, pipelined loop,
2341  // and update register names as needed.
2342  for (int Cycle = Schedule.getFirstCycle(),
2343  LastCycle = Schedule.getFinalCycle();
2344  Cycle <= LastCycle; ++Cycle) {
2345  std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2346  // This inner loop schedules each instruction in the cycle.
2347  for (SUnit *CI : CycleInstrs) {
2348  if (CI->getInstr()->isPHI())
2349  continue;
2350  unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2351  MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2352  updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2353  KernelBB->push_back(NewMI);
2354  InstrMap[NewMI] = CI->getInstr();
2355  }
2356  }
2357 
2358  // Copy any terminator instructions to the new kernel, and update
2359  // names as needed.
2360  for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2361  E = BB->instr_end();
2362  I != E; ++I) {
2363  MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2364  updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2365  KernelBB->push_back(NewMI);
2366  InstrMap[NewMI] = &*I;
2367  }
2368 
2369  KernelBB->transferSuccessors(BB);
2370  KernelBB->replaceSuccessor(BB, KernelBB);
2371 
2372  generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2373  VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2374  generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2375  InstrMap, MaxStageCount, MaxStageCount, false);
2376 
2377  LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2378 
2380  // Generate the epilog instructions to complete the pipeline.
2381  generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2382  PrologBBs);
2383 
2384  // We need this step because the register allocation doesn't handle some
2385  // situations well, so we insert copies to help out.
2386  splitLifetimes(KernelBB, EpilogBBs, Schedule);
2387 
2388  // Remove dead instructions due to loop induction variables.
2389  removeDeadInstructions(KernelBB, EpilogBBs);
2390 
2391  // Add branches between prolog and epilog blocks.
2392  addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2393 
2394  // Remove the original loop since it's no longer referenced.
2395  for (auto &I : *BB)
2397  BB->clear();
2398  BB->eraseFromParent();
2399 
2400  delete[] VRMap;
2401 }
2402 
2403 /// Generate the pipeline prolog code.
2404 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2405  MachineBasicBlock *KernelBB,
2406  ValueMapTy *VRMap,
2407  MBBVectorTy &PrologBBs) {
2408  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2409  assert(PreheaderBB != nullptr &&
2410  "Need to add code to handle loops w/o preheader");
2411  MachineBasicBlock *PredBB = PreheaderBB;
2412  InstrMapTy InstrMap;
2413 
2414  // Generate a basic block for each stage, not including the last stage,
2415  // which will be generated in the kernel. Each basic block may contain
2416  // instructions from multiple stages/iterations.
2417  for (unsigned i = 0; i < LastStage; ++i) {
2418  // Create and insert the prolog basic block prior to the original loop
2419  // basic block. The original loop is removed later.
2420  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2421  PrologBBs.push_back(NewBB);
2422  MF.insert(BB->getIterator(), NewBB);
2423  NewBB->transferSuccessors(PredBB);
2424  PredBB->addSuccessor(NewBB);
2425  PredBB = NewBB;
2426 
2427  // Generate instructions for each appropriate stage. Process instructions
2428  // in original program order.
2429  for (int StageNum = i; StageNum >= 0; --StageNum) {
2430  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2431  BBE = BB->getFirstTerminator();
2432  BBI != BBE; ++BBI) {
2433  if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2434  if (BBI->isPHI())
2435  continue;
2436  MachineInstr *NewMI =
2437  cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2438  updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2439  VRMap);
2440  NewBB->push_back(NewMI);
2441  InstrMap[NewMI] = &*BBI;
2442  }
2443  }
2444  }
2445  rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2446  LLVM_DEBUG({
2447  dbgs() << "prolog:\n";
2448  NewBB->dump();
2449  });
2450  }
2451 
2452  PredBB->replaceSuccessor(BB, KernelBB);
2453 
2454  // Check if we need to remove the branch from the preheader to the original
2455  // loop, and replace it with a branch to the new loop.
2456  unsigned numBranches = TII->removeBranch(*PreheaderBB);
2457  if (numBranches) {
2459  TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2460  }
2461 }
2462 
2463 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2464 /// that were started in either the prolog or the kernel. We create a basic
2465 /// block for each stage that needs to complete.
2466 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2467  MachineBasicBlock *KernelBB,
2468  ValueMapTy *VRMap,
2469  MBBVectorTy &EpilogBBs,
2470  MBBVectorTy &PrologBBs) {
2471  // We need to change the branch from the kernel to the first epilog block, so
2472  // this call to analyze branch uses the kernel rather than the original BB.
2473  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2475  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2476  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2477  if (checkBranch)
2478  return;
2479 
2480  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2481  if (*LoopExitI == KernelBB)
2482  ++LoopExitI;
2483  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2484  MachineBasicBlock *LoopExitBB = *LoopExitI;
2485 
2486  MachineBasicBlock *PredBB = KernelBB;
2487  MachineBasicBlock *EpilogStart = LoopExitBB;
2488  InstrMapTy InstrMap;
2489 
2490  // Generate a basic block for each stage, not including the last stage,
2491  // which was generated for the kernel. Each basic block may contain
2492  // instructions from multiple stages/iterations.
2493  int EpilogStage = LastStage + 1;
2494  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2496  EpilogBBs.push_back(NewBB);
2497  MF.insert(BB->getIterator(), NewBB);
2498 
2499  PredBB->replaceSuccessor(LoopExitBB, NewBB);
2500  NewBB->addSuccessor(LoopExitBB);
2501 
2502  if (EpilogStart == LoopExitBB)
2503  EpilogStart = NewBB;
2504 
2505  // Add instructions to the epilog depending on the current block.
2506  // Process instructions in original program order.
2507  for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2508  for (auto &BBI : *BB) {
2509  if (BBI.isPHI())
2510  continue;
2511  MachineInstr *In = &BBI;
2512  if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2513  // Instructions with memoperands in the epilog are updated with
2514  // conservative values.
2515  MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
2516  updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2517  NewBB->push_back(NewMI);
2518  InstrMap[NewMI] = In;
2519  }
2520  }
2521  }
2522  generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2523  VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2524  generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2525  InstrMap, LastStage, EpilogStage, i == 1);
2526  PredBB = NewBB;
2527 
2528  LLVM_DEBUG({
2529  dbgs() << "epilog:\n";
2530  NewBB->dump();
2531  });
2532  }
2533 
2534  // Fix any Phi nodes in the loop exit block.
2535  for (MachineInstr &MI : *LoopExitBB) {
2536  if (!MI.isPHI())
2537  break;
2538  for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2539  MachineOperand &MO = MI.getOperand(i);
2540  if (MO.getMBB() == BB)
2541  MO.setMBB(PredBB);
2542  }
2543  }
2544 
2545  // Create a branch to the new epilog from the kernel.
2546  // Remove the original branch and add a new branch to the epilog.
2547  TII->removeBranch(*KernelBB);
2548  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2549  // Add a branch to the loop exit.
2550  if (EpilogBBs.size() > 0) {
2551  MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2553  TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2554  }
2555 }
2556 
2557 /// Replace all uses of FromReg that appear outside the specified
2558 /// basic block with ToReg.
2559 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2560  MachineBasicBlock *MBB,
2561  MachineRegisterInfo &MRI,
2562  LiveIntervals &LIS) {
2563  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2564  E = MRI.use_end();
2565  I != E;) {
2566  MachineOperand &O = *I;
2567  ++I;
2568  if (O.getParent()->getParent() != MBB)
2569  O.setReg(ToReg);
2570  }
2571  if (!LIS.hasInterval(ToReg))
2572  LIS.createEmptyInterval(ToReg);
2573 }
2574 
2575 /// Return true if the register has a use that occurs outside the
2576 /// specified loop.
2577 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2578  MachineRegisterInfo &MRI) {
2580  E = MRI.use_end();
2581  I != E; ++I)
2582  if (I->getParent()->getParent() != BB)
2583  return true;
2584  return false;
2585 }
2586 
2587 /// Generate Phis for the specific block in the generated pipelined code.
2588 /// This function looks at the Phis from the original code to guide the
2589 /// creation of new Phis.
2590 void SwingSchedulerDAG::generateExistingPhis(
2592  MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2593  InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2594  bool IsLast) {
2595  // Compute the stage number for the initial value of the Phi, which
2596  // comes from the prolog. The prolog to use depends on to which kernel/
2597  // epilog that we're adding the Phi.
2598  unsigned PrologStage = 0;
2599  unsigned PrevStage = 0;
2600  bool InKernel = (LastStageNum == CurStageNum);
2601  if (InKernel) {
2602  PrologStage = LastStageNum - 1;
2603  PrevStage = CurStageNum;
2604  } else {
2605  PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2606  PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2607  }
2608 
2609  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2610  BBE = BB->getFirstNonPHI();
2611  BBI != BBE; ++BBI) {
2612  unsigned Def = BBI->getOperand(0).getReg();
2613 
2614  unsigned InitVal = 0;
2615  unsigned LoopVal = 0;
2616  getPhiRegs(*BBI, BB, InitVal, LoopVal);
2617 
2618  unsigned PhiOp1 = 0;
2619  // The Phi value from the loop body typically is defined in the loop, but
2620  // not always. So, we need to check if the value is defined in the loop.
2621  unsigned PhiOp2 = LoopVal;
2622  if (VRMap[LastStageNum].count(LoopVal))
2623  PhiOp2 = VRMap[LastStageNum][LoopVal];
2624 
2625  int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2626  int LoopValStage =
2627  Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2628  unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2629  if (NumStages == 0) {
2630  // We don't need to generate a Phi anymore, but we need to rename any uses
2631  // of the Phi value.
2632  unsigned NewReg = VRMap[PrevStage][LoopVal];
2633  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2634  Def, InitVal, NewReg);
2635  if (VRMap[CurStageNum].count(LoopVal))
2636  VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2637  }
2638  // Adjust the number of Phis needed depending on the number of prologs left,
2639  // and the distance from where the Phi is first scheduled. The number of
2640  // Phis cannot exceed the number of prolog stages. Each stage can
2641  // potentially define two values.
2642  unsigned MaxPhis = PrologStage + 2;
2643  if (!InKernel && (int)PrologStage <= LoopValStage)
2644  MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
2645  unsigned NumPhis = std::min(NumStages, MaxPhis);
2646 
2647  unsigned NewReg = 0;
2648  unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2649  // In the epilog, we may need to look back one stage to get the correct
2650  // Phi name because the epilog and prolog blocks execute the same stage.
2651  // The correct name is from the previous block only when the Phi has
2652  // been completely scheduled prior to the epilog, and Phi value is not
2653  // needed in multiple stages.
2654  int StageDiff = 0;
2655  if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2656  NumPhis == 1)
2657  StageDiff = 1;
2658  // Adjust the computations below when the phi and the loop definition
2659  // are scheduled in different stages.
2660  if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2661  StageDiff = StageScheduled - LoopValStage;
2662  for (unsigned np = 0; np < NumPhis; ++np) {
2663  // If the Phi hasn't been scheduled, then use the initial Phi operand
2664  // value. Otherwise, use the scheduled version of the instruction. This
2665  // is a little complicated when a Phi references another Phi.
2666  if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2667  PhiOp1 = InitVal;
2668  // Check if the Phi has already been scheduled in a prolog stage.
2669  else if (PrologStage >= AccessStage + StageDiff + np &&
2670  VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2671  PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2672  // Check if the Phi has already been scheduled, but the loop intruction
2673  // is either another Phi, or doesn't occur in the loop.
2674  else if (PrologStage >= AccessStage + StageDiff + np) {
2675  // If the Phi references another Phi, we need to examine the other
2676  // Phi to get the correct value.
2677  PhiOp1 = LoopVal;
2678  MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2679  int Indirects = 1;
2680  while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2681  int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2682  if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2683  PhiOp1 = getInitPhiReg(*InstOp1, BB);
2684  else
2685  PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2686  InstOp1 = MRI.getVRegDef(PhiOp1);
2687  int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2688  int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2689  if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2690  VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2691  PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2692  break;
2693  }
2694  ++Indirects;
2695  }
2696  } else
2697  PhiOp1 = InitVal;
2698  // If this references a generated Phi in the kernel, get the Phi operand
2699  // from the incoming block.
2700  if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2701  if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2702  PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2703 
2704  MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2705  bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2706  // In the epilog, a map lookup is needed to get the value from the kernel,
2707  // or previous epilog block. How is does this depends on if the
2708  // instruction is scheduled in the previous block.
2709  if (!InKernel) {
2710  int StageDiffAdj = 0;
2711  if (LoopValStage != -1 && StageScheduled > LoopValStage)
2712  StageDiffAdj = StageScheduled - LoopValStage;
2713  // Use the loop value defined in the kernel, unless the kernel
2714  // contains the last definition of the Phi.
2715  if (np == 0 && PrevStage == LastStageNum &&
2716  (StageScheduled != 0 || LoopValStage != 0) &&
2717  VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2718  PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2719  // Use the value defined by the Phi. We add one because we switch
2720  // from looking at the loop value to the Phi definition.
2721  else if (np > 0 && PrevStage == LastStageNum &&
2722  VRMap[PrevStage - np + 1].count(Def))
2723  PhiOp2 = VRMap[PrevStage - np + 1][Def];
2724  // Use the loop value defined in the kernel.
2725  else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
2726  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2727  PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2728  // Use the value defined by the Phi, unless we're generating the first
2729  // epilog and the Phi refers to a Phi in a different stage.
2730  else if (VRMap[PrevStage - np].count(Def) &&
2731  (!LoopDefIsPhi || PrevStage != LastStageNum))
2732  PhiOp2 = VRMap[PrevStage - np][Def];
2733  }
2734 
2735  // Check if we can reuse an existing Phi. This occurs when a Phi
2736  // references another Phi, and the other Phi is scheduled in an
2737  // earlier stage. We can try to reuse an existing Phi up until the last
2738  // stage of the current Phi.
2739  if (LoopDefIsPhi) {
2740  if (static_cast<int>(PrologStage - np) >= StageScheduled) {
2741  int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2742  int StageDiff = (StageScheduled - LoopValStage);
2743  LVNumStages -= StageDiff;
2744  // Make sure the loop value Phi has been processed already.
2745  if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
2746  NewReg = PhiOp2;
2747  unsigned ReuseStage = CurStageNum;
2748  if (Schedule.isLoopCarried(this, *PhiInst))
2749  ReuseStage -= LVNumStages;
2750  // Check if the Phi to reuse has been generated yet. If not, then
2751  // there is nothing to reuse.
2752  if (VRMap[ReuseStage - np].count(LoopVal)) {
2753  NewReg = VRMap[ReuseStage - np][LoopVal];
2754 
2755  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2756  &*BBI, Def, NewReg);
2757  // Update the map with the new Phi name.
2758  VRMap[CurStageNum - np][Def] = NewReg;
2759  PhiOp2 = NewReg;
2760  if (VRMap[LastStageNum - np - 1].count(LoopVal))
2761  PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2762 
2763  if (IsLast && np == NumPhis - 1)
2764  replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2765  continue;
2766  }
2767  }
2768  }
2769  if (InKernel && StageDiff > 0 &&
2770  VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2771  PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2772  }
2773 
2774  const TargetRegisterClass *RC = MRI.getRegClass(Def);
2775  NewReg = MRI.createVirtualRegister(RC);
2776 
2777  MachineInstrBuilder NewPhi =
2778  BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2779  TII->get(TargetOpcode::PHI), NewReg);
2780  NewPhi.addReg(PhiOp1).addMBB(BB1);
2781  NewPhi.addReg(PhiOp2).addMBB(BB2);
2782  if (np == 0)
2783  InstrMap[NewPhi] = &*BBI;
2784 
2785  // We define the Phis after creating the new pipelined code, so
2786  // we need to rename the Phi values in scheduled instructions.
2787 
2788  unsigned PrevReg = 0;
2789  if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2790  PrevReg = VRMap[PrevStage - np][LoopVal];
2791  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2792  Def, NewReg, PrevReg);
2793  // If the Phi has been scheduled, use the new name for rewriting.
2794  if (VRMap[CurStageNum - np].count(Def)) {
2795  unsigned R = VRMap[CurStageNum - np][Def];
2796  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2797  R, NewReg);
2798  }
2799 
2800  // Check if we need to rename any uses that occurs after the loop. The
2801  // register to replace depends on whether the Phi is scheduled in the
2802  // epilog.
2803  if (IsLast && np == NumPhis - 1)
2804  replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2805 
2806  // In the kernel, a dependent Phi uses the value from this Phi.
2807  if (InKernel)
2808  PhiOp2 = NewReg;
2809 
2810  // Update the map with the new Phi name.
2811  VRMap[CurStageNum - np][Def] = NewReg;
2812  }
2813 
2814  while (NumPhis++ < NumStages) {
2815  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2816  &*BBI, Def, NewReg, 0);
2817  }
2818 
2819  // Check if we need to rename a Phi that has been eliminated due to
2820  // scheduling.
2821  if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2822  replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2823  }
2824 }
2825 
2826 /// Generate Phis for the specified block in the generated pipelined code.
2827 /// These are new Phis needed because the definition is scheduled after the
2828 /// use in the pipelined sequence.
2829 void SwingSchedulerDAG::generatePhis(
2831  MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2832  InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2833  bool IsLast) {
2834  // Compute the stage number that contains the initial Phi value, and
2835  // the Phi from the previous stage.
2836  unsigned PrologStage = 0;
2837  unsigned PrevStage = 0;
2838  unsigned StageDiff = CurStageNum - LastStageNum;
2839  bool InKernel = (StageDiff == 0);
2840  if (InKernel) {
2841  PrologStage = LastStageNum - 1;
2842  PrevStage = CurStageNum;
2843  } else {
2844  PrologStage = LastStageNum - StageDiff;
2845  PrevStage = LastStageNum + StageDiff - 1;
2846  }
2847 
2848  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2849  BBE = BB->instr_end();
2850  BBI != BBE; ++BBI) {
2851  for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2852  MachineOperand &MO = BBI->getOperand(i);
2853  if (!MO.isReg() || !MO.isDef() ||
2855  continue;
2856 
2857  int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2858  assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2859  unsigned Def = MO.getReg();
2860  unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2861  // An instruction scheduled in stage 0 and is used after the loop
2862  // requires a phi in the epilog for the last definition from either
2863  // the kernel or prolog.
2864  if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2865  hasUseAfterLoop(Def, BB, MRI))
2866  NumPhis = 1;
2867  if (!InKernel && (unsigned)StageScheduled > PrologStage)
2868  continue;
2869 
2870  unsigned PhiOp2 = VRMap[PrevStage][Def];
2871  if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2872  if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2873  PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2874  // The number of Phis can't exceed the number of prolog stages. The
2875  // prolog stage number is zero based.
2876  if (NumPhis > PrologStage + 1 - StageScheduled)
2877  NumPhis = PrologStage + 1 - StageScheduled;
2878  for (unsigned np = 0; np < NumPhis; ++np) {
2879  unsigned PhiOp1 = VRMap[PrologStage][Def];
2880  if (np <= PrologStage)
2881  PhiOp1 = VRMap[PrologStage - np][Def];
2882  if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2883  if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2884  PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2885  if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2886  PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2887  }
2888  if (!InKernel)
2889  PhiOp2 = VRMap[PrevStage - np][Def];
2890 
2891  const TargetRegisterClass *RC = MRI.getRegClass(Def);
2892  unsigned NewReg = MRI.createVirtualRegister(RC);
2893 
2894  MachineInstrBuilder NewPhi =
2895  BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2896  TII->get(TargetOpcode::PHI), NewReg);
2897  NewPhi.addReg(PhiOp1).addMBB(BB1);
2898  NewPhi.addReg(PhiOp2).addMBB(BB2);
2899  if (np == 0)
2900  InstrMap[NewPhi] = &*BBI;
2901 
2902  // Rewrite uses and update the map. The actions depend upon whether
2903  // we generating code for the kernel or epilog blocks.
2904  if (InKernel) {
2905  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2906  &*BBI, PhiOp1, NewReg);
2907  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2908  &*BBI, PhiOp2, NewReg);
2909 
2910  PhiOp2 = NewReg;
2911  VRMap[PrevStage - np - 1][Def] = NewReg;
2912  } else {
2913  VRMap[CurStageNum - np][Def] = NewReg;
2914  if (np == NumPhis - 1)
2915  rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2916  &*BBI, Def, NewReg);
2917  }
2918  if (IsLast && np == NumPhis - 1)
2919  replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2920  }
2921  }
2922  }
2923 }
2924 
2925 /// Remove instructions that generate values with no uses.
2926 /// Typically, these are induction variable operations that generate values
2927 /// used in the loop itself. A dead instruction has a definition with
2928 /// no uses, or uses that occur in the original loop only.
2929 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2930  MBBVectorTy &EpilogBBs) {
2931  // For each epilog block, check that the value defined by each instruction
2932  // is used. If not, delete it.
2933  for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2934  MBE = EpilogBBs.rend();
2935  MBB != MBE; ++MBB)
2936  for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2937  ME = (*MBB)->instr_rend();
2938  MI != ME;) {
2939  // From DeadMachineInstructionElem. Don't delete inline assembly.
2940  if (MI->isInlineAsm()) {
2941  ++MI;
2942  continue;
2943  }
2944  bool SawStore = false;
2945  // Check if it's safe to remove the instruction due to side effects.
2946  // We can, and want to, remove Phis here.
2947  if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2948  ++MI;
2949  continue;
2950  }
2951  bool used = true;
2952  for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2953  MOE = MI->operands_end();
2954  MOI != MOE; ++MOI) {
2955  if (!MOI->isReg() || !MOI->isDef())
2956  continue;
2957  unsigned reg = MOI->getReg();
2958  // Assume physical registers are used, unless they are marked dead.
2960  used = !MOI->isDead();
2961  if (used)
2962  break;
2963  continue;
2964  }
2965  unsigned realUses = 0;
2966  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2967  EI = MRI.use_end();
2968  UI != EI; ++UI) {
2969  // Check if there are any uses that occur only in the original
2970  // loop. If so, that's not a real use.
2971  if (UI->getParent()->getParent() != BB) {
2972  realUses++;
2973  used = true;
2974  break;
2975  }
2976  }
2977  if (realUses > 0)
2978  break;
2979  used = false;
2980  }
2981  if (!used) {
2983  MI++->eraseFromParent();
2984  continue;
2985  }
2986  ++MI;
2987  }
2988  // In the kernel block, check if we can remove a Phi that generates a value
2989  // used in an instruction removed in the epilog block.
2990  for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2991  BBE = KernelBB->getFirstNonPHI();
2992  BBI != BBE;) {
2993  MachineInstr *MI = &*BBI;
2994  ++BBI;
2995  unsigned reg = MI->getOperand(0).getReg();
2996  if (MRI.use_begin(reg) == MRI.use_end()) {
2997  LIS.RemoveMachineInstrFromMaps(*MI);
2998  MI->eraseFromParent();
2999  }
3000  }
3001 }
3002 
3003 /// For loop carried definitions, we split the lifetime of a virtual register
3004 /// that has uses past the definition in the next iteration. A copy with a new
3005 /// virtual register is inserted before the definition, which helps with
3006 /// generating a better register assignment.
3007 ///
3008 /// v1 = phi(a, v2) v1 = phi(a, v2)
3009 /// v2 = phi(b, v3) v2 = phi(b, v3)
3010 /// v3 = .. v4 = copy v1
3011 /// .. = V1 v3 = ..
3012 /// .. = v4
3013 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
3014  MBBVectorTy &EpilogBBs,
3015  SMSchedule &Schedule) {
3017  for (auto &PHI : KernelBB->phis()) {
3018  unsigned Def = PHI.getOperand(0).getReg();
3019  // Check for any Phi definition that used as an operand of another Phi
3020  // in the same block.
3022  E = MRI.use_instr_end();
3023  I != E; ++I) {
3024  if (I->isPHI() && I->getParent() == KernelBB) {
3025  // Get the loop carried definition.
3026  unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
3027  if (!LCDef)
3028  continue;
3029  MachineInstr *MI = MRI.getVRegDef(LCDef);
3030  if (!MI || MI->getParent() != KernelBB || MI->isPHI())
3031  continue;
3032  // Search through the rest of the block looking for uses of the Phi
3033  // definition. If one occurs, then split the lifetime.
3034  unsigned SplitReg = 0;
3035  for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
3036  KernelBB->instr_end()))
3037  if (BBJ.readsRegister(Def)) {
3038  // We split the lifetime when we find the first use.
3039  if (SplitReg == 0) {
3040  SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
3041  BuildMI(*KernelBB, MI, MI->getDebugLoc(),
3042  TII->get(TargetOpcode::COPY), SplitReg)
3043  .addReg(Def);
3044  }
3045  BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
3046  }
3047  if (!SplitReg)
3048  continue;
3049  // Search through each of the epilog blocks for any uses to be renamed.
3050  for (auto &Epilog : EpilogBBs)
3051  for (auto &I : *Epilog)
3052  if (I.readsRegister(Def))
3053  I.substituteRegister(Def, SplitReg, 0, *TRI);
3054  break;
3055  }
3056  }
3057  }
3058 }
3059 
3060 /// Remove the incoming block from the Phis in a basic block.
3061 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
3062  for (MachineInstr &MI : *BB) {
3063  if (!MI.isPHI())
3064  break;
3065  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
3066  if (MI.getOperand(i + 1).getMBB() == Incoming) {
3067  MI.RemoveOperand(i + 1);
3068  MI.RemoveOperand(i);
3069  break;
3070  }
3071  }
3072 }
3073 
3074 /// Create branches from each prolog basic block to the appropriate epilog
3075 /// block. These edges are needed if the loop ends before reaching the
3076 /// kernel.
3077 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
3078  MachineBasicBlock *KernelBB,
3079  MBBVectorTy &EpilogBBs,
3080  SMSchedule &Schedule, ValueMapTy *VRMap) {
3081  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
3082  MachineInstr *IndVar = Pass.LI.LoopInductionVar;
3083  MachineInstr *Cmp = Pass.LI.LoopCompare;
3084  MachineBasicBlock *LastPro = KernelBB;
3085  MachineBasicBlock *LastEpi = KernelBB;
3086 
3087  // Start from the blocks connected to the kernel and work "out"
3088  // to the first prolog and the last epilog blocks.
3090  unsigned MaxIter = PrologBBs.size() - 1;
3091  unsigned LC = UINT_MAX;
3092  unsigned LCMin = UINT_MAX;
3093  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
3094  // Add branches to the prolog that go to the corresponding
3095  // epilog, and the fall-thru prolog/kernel block.
3096  MachineBasicBlock *Prolog = PrologBBs[j];
3097  MachineBasicBlock *Epilog = EpilogBBs[i];
3098  // We've executed one iteration, so decrement the loop count and check for
3099  // the loop end.
3101  // Check if the LOOP0 has already been removed. If so, then there is no need
3102  // to reduce the trip count.
3103  if (LC != 0)
3104  LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
3105  MaxIter);
3106 
3107  // Record the value of the first trip count, which is used to determine if
3108  // branches and blocks can be removed for constant trip counts.
3109  if (LCMin == UINT_MAX)
3110  LCMin = LC;
3111 
3112  unsigned numAdded = 0;
3114  Prolog->addSuccessor(Epilog);
3115  numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
3116  } else if (j >= LCMin) {
3117  Prolog->addSuccessor(Epilog);
3118  Prolog->removeSuccessor(LastPro);
3119  LastEpi->removeSuccessor(Epilog);
3120  numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
3121  removePhis(Epilog, LastEpi);
3122  // Remove the blocks that are no longer referenced.
3123  if (LastPro != LastEpi) {
3124  LastEpi->clear();
3125  LastEpi->eraseFromParent();
3126  }
3127  LastPro->clear();
3128  LastPro->eraseFromParent();
3129  } else {
3130  numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
3131  removePhis(Epilog, Prolog);
3132  }
3133  LastPro = Prolog;
3134  LastEpi = Epilog;
3136  E = Prolog->instr_rend();
3137  I != E && numAdded > 0; ++I, --numAdded)
3138  updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3139  }
3140 }
3141 
3142 /// Return true if we can compute the amount the instruction changes
3143 /// during each iteration. Set Delta to the amount of the change.
3144 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3146  unsigned BaseReg;
3147  int64_t Offset;
3148  if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3149  return false;
3150 
3151  MachineRegisterInfo &MRI = MF.getRegInfo();
3152  // Check if there is a Phi. If so, get the definition in the loop.
3153  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3154  if (BaseDef && BaseDef->isPHI()) {
3155  BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3156  BaseDef = MRI.getVRegDef(BaseReg);
3157  }
3158  if (!BaseDef)
3159  return false;
3160 
3161  int D = 0;
3162  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
3163  return false;
3164 
3165  Delta = D;
3166  return true;
3167 }
3168 
3169 /// Update the memory operand with a new offset when the pipeliner
3170 /// generates a new copy of the instruction that refers to a
3171 /// different memory location.
3172 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3173  MachineInstr &OldMI, unsigned Num) {
3174  if (Num == 0)
3175  return;
3176  // If the instruction has memory operands, then adjust the offset
3177  // when the instruction appears in different stages.
3178  if (NewMI.memoperands_empty())
3179  return;
3181  for (MachineMemOperand *MMO : NewMI.memoperands()) {
3182  if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
3183  (!MMO->getValue())) {
3184  NewMMOs.push_back(MMO);
3185  continue;
3186  }
3187  unsigned Delta;
3188  if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
3189  int64_t AdjOffset = Delta * Num;
3190  NewMMOs.push_back(
3191  MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
3192  } else {
3193  NewMMOs.push_back(
3195  }
3196  }
3197  NewMI.setMemRefs(MF, NewMMOs);
3198 }
3199 
3200 /// Clone the instruction for the new pipelined loop and update the
3201 /// memory operands, if needed.
3202 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3203  unsigned CurStageNum,
3204  unsigned InstStageNum) {
3205  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3206  // Check for tied operands in inline asm instructions. This should be handled
3207  // elsewhere, but I'm not sure of the best solution.
3208  if (OldMI->isInlineAsm())
3209  for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
3210  const auto &MO = OldMI->getOperand(i);
3211  if (MO.isReg() && MO.isUse())
3212  break;
3213  unsigned UseIdx;
3214  if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3215  NewMI->tieOperands(i, UseIdx);
3216  }
3217  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3218  return NewMI;
3219 }
3220 
3221 /// Clone the instruction for the new pipelined loop. If needed, this
3222 /// function updates the instruction using the values saved in the
3223 /// InstrChanges structure.
3224 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3225  unsigned CurStageNum,
3226  unsigned InstStageNum,
3227  SMSchedule &Schedule) {
3228  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3230  InstrChanges.find(getSUnit(OldMI));
3231  if (It != InstrChanges.end()) {
3232  std::pair<unsigned, int64_t> RegAndOffset = It->second;
3233  unsigned BasePos, OffsetPos;
3234  if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
3235  return nullptr;
3236  int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3237  MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3238  if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3239  NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3240  NewMI->getOperand(OffsetPos).setImm(NewOffset);
3241  }
3242  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3243  return NewMI;
3244 }
3245 
3246 /// Update the machine instruction with new virtual registers. This
3247 /// function may change the defintions and/or uses.
3248 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3249  unsigned CurStageNum,
3250  unsigned InstrStageNum,
3251  SMSchedule &Schedule,
3252  ValueMapTy *VRMap) {
3253  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
3254  MachineOperand &MO = NewMI->getOperand(i);
3256  continue;
3257  unsigned reg = MO.getReg();
3258  if (MO.isDef()) {
3259  // Create a new virtual register for the definition.
3260  const TargetRegisterClass *RC = MRI.getRegClass(reg);
3261  unsigned NewReg = MRI.createVirtualRegister(RC);
3262  MO.setReg(NewReg);
3263  VRMap[CurStageNum][reg] = NewReg;
3264  if (LastDef)
3265  replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3266  } else if (MO.isUse()) {
3267  MachineInstr *Def = MRI.getVRegDef(reg);
3268  // Compute the stage that contains the last definition for instruction.
3269  int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3270  unsigned StageNum = CurStageNum;
3271  if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
3272  // Compute the difference in stages between the defintion and the use.
3273  unsigned StageDiff = (InstrStageNum - DefStageNum);
3274  // Make an adjustment to get the last definition.
3275  StageNum -= StageDiff;
3276  }
3277  if (VRMap[StageNum].count(reg))
3278  MO.setReg(VRMap[StageNum][reg]);
3279  }
3280  }
3281 }
3282 
3283 /// Return the instruction in the loop that defines the register.
3284 /// If the definition is a Phi, then follow the Phi operand to
3285 /// the instruction in the loop.
3286 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3288  MachineInstr *Def = MRI.getVRegDef(Reg);
3289  while (Def->isPHI()) {
3290  if (!Visited.insert(Def).second)
3291  break;
3292  for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3293  if (Def->getOperand(i + 1).getMBB() == BB) {
3294  Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3295  break;
3296  }
3297  }
3298  return Def;
3299 }
3300 
3301 /// Return the new name for the value from the previous stage.
3302 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3303  unsigned LoopVal, unsigned LoopStage,
3304  ValueMapTy *VRMap,
3305  MachineBasicBlock *BB) {
3306  unsigned PrevVal = 0;
3307  if (StageNum > PhiStage) {
3308  MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3309  if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3310  // The name is defined in the previous stage.
3311  PrevVal = VRMap[StageNum - 1][LoopVal];
3312  else if (VRMap[StageNum].count(LoopVal))
3313  // The previous name is defined in the current stage when the instruction
3314  // order is swapped.
3315  PrevVal = VRMap[StageNum][LoopVal];
3316  else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
3317  // The loop value hasn't yet been scheduled.
3318  PrevVal = LoopVal;
3319  else if (StageNum == PhiStage + 1)
3320  // The loop value is another phi, which has not been scheduled.
3321  PrevVal = getInitPhiReg(*LoopInst, BB);
3322  else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3323  // The loop value is another phi, which has been scheduled.
3324  PrevVal =
3325  getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3326  LoopStage, VRMap, BB);
3327  }
3328  return PrevVal;
3329 }
3330 
3331 /// Rewrite the Phi values in the specified block to use the mappings
3332 /// from the initial operand. Once the Phi is scheduled, we switch
3333 /// to using the loop value instead of the Phi value, so those names
3334 /// do not need to be rewritten.
3335 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3336  unsigned StageNum,
3337  SMSchedule &Schedule,
3338  ValueMapTy *VRMap,
3339  InstrMapTy &InstrMap) {
3340  for (auto &PHI : BB->phis()) {
3341  unsigned InitVal = 0;
3342  unsigned LoopVal = 0;
3343  getPhiRegs(PHI, BB, InitVal, LoopVal);
3344  unsigned PhiDef = PHI.getOperand(0).getReg();
3345 
3346  unsigned PhiStage =
3347  (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3348  unsigned LoopStage =
3349  (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3350  unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3351  if (NumPhis > StageNum)
3352  NumPhis = StageNum;
3353  for (unsigned np = 0; np <= NumPhis; ++np) {
3354  unsigned NewVal =
3355  getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3356  if (!NewVal)
3357  NewVal = InitVal;
3358  rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
3359  PhiDef, NewVal);
3360  }
3361  }
3362 }
3363 
3364 /// Rewrite a previously scheduled instruction to use the register value
3365 /// from the new instruction. Make sure the instruction occurs in the
3366 /// basic block, and we don't change the uses in the new instruction.
3367 void SwingSchedulerDAG::rewriteScheduledInstr(
3368  MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3369  unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3370  unsigned NewReg, unsigned PrevReg) {
3371  bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3372  int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3373  // Rewrite uses that have been scheduled already to use the new
3374  // Phi register.
3375  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3376  EI = MRI.use_end();
3377  UI != EI;) {
3378  MachineOperand &UseOp = *UI;
3379  MachineInstr *UseMI = UseOp.getParent();
3380  ++UI;
3381  if (UseMI->getParent() != BB)
3382  continue;
3383  if (UseMI->isPHI()) {
3384  if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3385  continue;
3386  if (getLoopPhiReg(*UseMI, BB) != OldReg)
3387  continue;
3388  }
3389  InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3390  assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3391  SUnit *OrigMISU = getSUnit(OrigInstr->second);
3392  int StageSched = Schedule.stageScheduled(OrigMISU);
3393  int CycleSched = Schedule.cycleScheduled(OrigMISU);
3394  unsigned ReplaceReg = 0;
3395  // This is the stage for the scheduled instruction.
3396  if (StagePhi == StageSched && Phi->isPHI()) {
3397  int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3398  if (PrevReg && InProlog)
3399  ReplaceReg = PrevReg;
3400  else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3401  (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3402  ReplaceReg = PrevReg;
3403  else
3404  ReplaceReg = NewReg;
3405  }
3406  // The scheduled instruction occurs before the scheduled Phi, and the
3407  // Phi is not loop carried.
3408  if (!InProlog && StagePhi + 1 == StageSched &&
3409  !Schedule.isLoopCarried(this, *Phi))
3410  ReplaceReg = NewReg;
3411  if (StagePhi > StageSched && Phi->isPHI())
3412  ReplaceReg = NewReg;
3413  if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3414  ReplaceReg = NewReg;
3415  if (ReplaceReg) {
3416  MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3417  UseOp.setReg(ReplaceReg);
3418  }
3419  }
3420 }
3421 
3422 /// Check if we can change the instruction to use an offset value from the
3423 /// previous iteration. If so, return true and set the base and offset values
3424 /// so that we can rewrite the load, if necessary.
3425 /// v1 = Phi(v0, v3)
3426 /// v2 = load v1, 0
3427 /// v3 = post_store v1, 4, x
3428 /// This function enables the load to be rewritten as v2 = load v3, 4.
3429 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3430  unsigned &BasePos,
3431  unsigned &OffsetPos,
3432  unsigned &NewBase,
3433  int64_t &Offset) {
3434  // Get the load instruction.
3435  if (TII->isPostIncrement(*MI))
3436  return false;
3437  unsigned BasePosLd, OffsetPosLd;
3438  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3439  return false;
3440  unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3441 
3442  // Look for the Phi instruction.
3443  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3444  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3445  if (!Phi || !Phi->isPHI())
3446  return false;
3447  // Get the register defined in the loop block.
3448  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3449  if (!PrevReg)
3450  return false;
3451 
3452  // Check for the post-increment load/store instruction.
3453  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3454  if (!PrevDef || PrevDef == MI)
3455  return false;
3456 
3457  if (!TII->isPostIncrement(*PrevDef))
3458  return false;
3459 
3460  unsigned BasePos1 = 0, OffsetPos1 = 0;
3461  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3462  return false;
3463 
3464  // Make sure that the instructions do not access the same memory location in
3465  // the next iteration.
3466  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3467  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3468  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3469  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3470  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3471  MF.DeleteMachineInstr(NewMI);
3472  if (!Disjoint)
3473  return false;
3474 
3475  // Set the return value once we determine that we return true.
3476  BasePos = BasePosLd;
3477  OffsetPos = OffsetPosLd;
3478  NewBase = PrevReg;
3479  Offset = StoreOffset;
3480  return true;
3481 }
3482 
3483 /// Apply changes to the instruction if needed. The changes are need
3484 /// to improve the scheduling and depend up on the final schedule.
3485 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3486  SMSchedule &Schedule) {
3487  SUnit *SU = getSUnit(MI);
3489  InstrChanges.find(SU);
3490  if (It != InstrChanges.end()) {
3491  std::pair<unsigned, int64_t> RegAndOffset = It->second;
3492  unsigned BasePos, OffsetPos;
3493  if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3494  return;
3495  unsigned BaseReg = MI->getOperand(BasePos).getReg();
3496  MachineInstr *LoopDef = findDefInLoop(BaseReg);
3497  int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3498  int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3499  int BaseStageNum = Schedule.stageScheduled(SU);
3500  int BaseCycleNum = Schedule.cycleScheduled(SU);
3501  if (BaseStageNum < DefStageNum) {
3502  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3503  int OffsetDiff = DefStageNum - BaseStageNum;
3504  if (DefCycleNum < BaseCycleNum) {
3505  NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3506  if (OffsetDiff > 0)
3507  --OffsetDiff;
3508  }
3509  int64_t NewOffset =
3510  MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3511  NewMI->getOperand(OffsetPos).setImm(NewOffset);
3512  SU->setInstr(NewMI);
3513  MISUnitMap[NewMI] = SU;
3514  NewMIs.insert(NewMI);
3515  }
3516  }
3517 }
3518 
3519 /// Return true for an order or output dependence that is loop carried
3520 /// potentially. A dependence is loop carried if the destination defines a valu
3521 /// that may be used or defined by the source in a subsequent iteration.
3522 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
3523  bool isSucc) {
3524  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
3525  Dep.isArtificial())
3526  return false;
3527 
3528  if (!SwpPruneLoopCarried)
3529  return true;
3530 
3531  if (Dep.getKind() == SDep::Output)
3532  return true;
3533 
3534  MachineInstr *SI = Source->getInstr();
3535  MachineInstr *DI = Dep.getSUnit()->getInstr();
3536  if (!isSucc)
3537  std::swap(SI, DI);
3538  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3539 
3540  // Assume ordered loads and stores may have a loop carried dependence.
3541  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3543  return true;
3544 
3545  // Only chain dependences between a load and store can be loop carried.
3546  if (!DI->mayStore() || !SI->mayLoad())
3547  return false;
3548 
3549  unsigned DeltaS, DeltaD;
3550  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3551  return true;
3552 
3553  unsigned BaseRegS, BaseRegD;
3554  int64_t OffsetS, OffsetD;
3556  if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
3557  !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
3558  return true;
3559 
3560  if (BaseRegS != BaseRegD)
3561  return true;
3562 
3563  // Check that the base register is incremented by a constant value for each
3564  // iteration.
3565  MachineInstr *Def = MRI.getVRegDef(BaseRegS);
3566  if (!Def || !Def->isPHI())
3567  return true;
3568  unsigned InitVal = 0;
3569  unsigned LoopVal = 0;
3570  getPhiRegs(*Def, BB, InitVal, LoopVal);
3571  MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
3572  int D = 0;
3573  if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
3574  return true;
3575 
3576  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3577  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3578 
3579  // This is the main test, which checks the offset values and the loop
3580  // increment value to determine if the accesses may be loop carried.
3581  if (OffsetS >= OffsetD)
3582  return OffsetS + AccessSizeS > DeltaS;
3583  else
3584  return OffsetD + AccessSizeD > DeltaD;
3585 
3586  return true;
3587 }
3588 
3589 void SwingSchedulerDAG::postprocessDAG() {
3590  for (auto &M : Mutations)
3591  M->apply(this);
3592 }
3593 
3594 /// Try to schedule the node at the specified StartCycle and continue
3595 /// until the node is schedule or the EndCycle is reached. This function
3596 /// returns true if the node is scheduled. This routine may search either
3597 /// forward or backward for a place to insert the instruction based upon
3598 /// the relative values of StartCycle and EndCycle.
3599 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3600  bool forward = true;
3601  if (StartCycle > EndCycle)
3602  forward = false;
3603 
3604  // The terminating condition depends on the direction.
3605  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3606  for (int curCycle = StartCycle; curCycle != termCycle;
3607  forward ? ++curCycle : --curCycle) {
3608 
3609  // Add the already scheduled instructions at the specified cycle to the DFA.
3610  Resources->clearResources();
3611  for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3612  checkCycle <= LastCycle; checkCycle += II) {
3613  std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3614 
3615  for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3616  E = cycleInstrs.end();
3617  I != E; ++I) {
3618  if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3619  continue;
3620  assert(Resources->canReserveResources(*(*I)->getInstr()) &&
3621  "These instructions have already been scheduled.");
3622  Resources->reserveResources(*(*I)->getInstr());
3623  }
3624  }
3625  if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3626  Resources->canReserveResources(*SU->getInstr())) {
3627  LLVM_DEBUG({
3628  dbgs() << "\tinsert at cycle " << curCycle << " ";
3629  SU->getInstr()->dump();
3630  });
3631 
3632  ScheduledInstrs[curCycle].push_back(SU);
3633  InstrToCycle.insert(std::make_pair(SU, curCycle));
3634  if (curCycle > LastCycle)
3635  LastCycle = curCycle;
3636  if (curCycle < FirstCycle)
3637  FirstCycle = curCycle;
3638  return true;
3639  }
3640  LLVM_DEBUG({
3641  dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3642  SU->getInstr()->dump();
3643  });
3644  }
3645  return false;
3646 }
3647 
3648 // Return the cycle of the earliest scheduled instruction in the chain.
3649 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3650  SmallPtrSet<SUnit *, 8> Visited;
3651  SmallVector<SDep, 8> Worklist;
3652  Worklist.push_back(Dep);
3653  int EarlyCycle = INT_MAX;
3654  while (!Worklist.empty()) {
3655  const SDep &Cur = Worklist.pop_back_val();
3656  SUnit *PrevSU = Cur.getSUnit();
3657  if (Visited.count(PrevSU))
3658  continue;
3659  std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3660  if (it == InstrToCycle.end())
3661  continue;
3662  EarlyCycle = std::min(EarlyCycle, it->second);
3663  for (const auto &PI : PrevSU->Preds)
3664  if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3665  Worklist.push_back(PI);
3666  Visited.insert(PrevSU);
3667  }
3668  return EarlyCycle;
3669 }
3670 
3671 // Return the cycle of the latest scheduled instruction in the chain.
3672 int SMSchedule::latestCycleInChain(const SDep &Dep) {
3673  SmallPtrSet<SUnit *, 8> Visited;
3674  SmallVector<SDep, 8> Worklist;
3675  Worklist.push_back(Dep);
3676  int LateCycle = INT_MIN;
3677  while (!Worklist.empty()) {
3678  const SDep &Cur = Worklist.pop_back_val();
3679  SUnit *SuccSU = Cur.getSUnit();
3680  if (Visited.count(SuccSU))
3681  continue;
3682  std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3683  if (it == InstrToCycle.end())
3684  continue;
3685  LateCycle = std::max(LateCycle, it->second);
3686  for (const auto &SI : SuccSU->Succs)
3687  if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3688  Worklist.push_back(SI);
3689  Visited.insert(SuccSU);
3690  }
3691  return LateCycle;
3692 }
3693 
3694 /// If an instruction has a use that spans multiple iterations, then
3695 /// return true. These instructions are characterized by having a back-ege
3696 /// to a Phi, which contains a reference to another Phi.
3697 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3698  for (auto &P : SU->Preds)
3699  if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3700  for (auto &S : P.getSUnit()->Succs)
3701  if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3702  return P.getSUnit();
3703  return nullptr;
3704 }
3705 
3706 /// Compute the scheduling start slot for the instruction. The start slot
3707 /// depends on any predecessor or successor nodes scheduled already.
3708 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3709  int *MinEnd, int *MaxStart, int II,
3710  SwingSchedulerDAG *DAG) {
3711  // Iterate over each instruction that has been scheduled already. The start
3712  // slot computation depends on whether the previously scheduled instruction
3713  // is a predecessor or successor of the specified instruction.
3714  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3715 
3716  // Iterate over each instruction in the current cycle.
3717  for (SUnit *I : getInstructions(cycle)) {
3718  // Because we're processing a DAG for the dependences, we recognize
3719  // the back-edge in recurrences by anti dependences.
3720  for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3721  const SDep &Dep = SU->Preds[i];
3722  if (Dep.getSUnit() == I) {
3723  if (!DAG->isBackedge(SU, Dep)) {
3724  int EarlyStart = cycle + Dep.getLatency() -
3725  DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3726  *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3727  if (DAG->isLoopCarriedDep(SU, Dep, false)) {
3728  int End = earliestCycleInChain(Dep) + (II - 1);
3729  *MinEnd = std::min(*MinEnd, End);
3730  }
3731  } else {
3732  int LateStart = cycle - Dep.getLatency() +
3733  DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3734  *MinLateStart = std::min(*MinLateStart, LateStart);
3735  }
3736  }
3737  // For instruction that requires multiple iterations, make sure that
3738  // the dependent instruction is not scheduled past the definition.
3739  SUnit *BE = multipleIterations(I, DAG);
3740  if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3741  !SU->isPred(I))
3742  *MinLateStart = std::min(*MinLateStart, cycle);
3743  }
3744  for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
3745  if (SU->Succs[i].getSUnit() == I) {
3746  const SDep &Dep = SU->Succs[i];
3747  if (!DAG->isBackedge(SU, Dep)) {
3748  int LateStart = cycle - Dep.getLatency() +
3749  DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3750  *MinLateStart = std::min(*MinLateStart, LateStart);
3751  if (DAG->isLoopCarriedDep(SU, Dep)) {
3752  int Start = latestCycleInChain(Dep) + 1 - II;
3753  *MaxStart = std::max(*MaxStart, Start);
3754  }
3755  } else {
3756  int EarlyStart = cycle + Dep.getLatency() -
3757  DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3758  *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3759  }
3760  }
3761  }
3762  }
3763  }
3764 }
3765 
3766 /// Order the instructions within a cycle so that the definitions occur
3767 /// before the uses. Returns true if the instruction is added to the start
3768 /// of the list, or false if added to the end.
3769 void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3770  std::deque<SUnit *> &Insts) {
3771  MachineInstr *MI = SU->getInstr();
3772  bool OrderBeforeUse = false;
3773  bool OrderAfterDef = false;
3774  bool OrderBeforeDef = false;
3775  unsigned MoveDef = 0;
3776  unsigned MoveUse = 0;
3777  int StageInst1 = stageScheduled(SU);
3778 
3779  unsigned Pos = 0;
3780  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3781  ++I, ++Pos) {
3782  for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3783  MachineOperand &MO = MI->getOperand(i);
3785  continue;
3786 
3787  unsigned Reg = MO.getReg();
3788  unsigned BasePos, OffsetPos;
3789  if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3790  if (MI->getOperand(BasePos).getReg() == Reg)
3791  if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3792  Reg = NewReg;
3793  bool Reads, Writes;
3794  std::tie(Reads, Writes) =
3795  (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3796  if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3797  OrderBeforeUse = true;
3798  if (MoveUse == 0)
3799  MoveUse = Pos;
3800  } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3801  // Add the instruction after the scheduled instruction.
3802  OrderAfterDef = true;
3803  MoveDef = Pos;
3804  } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3805  if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3806  OrderBeforeUse = true;
3807  if (MoveUse == 0)
3808  MoveUse = Pos;
3809  } else {
3810  OrderAfterDef = true;
3811  MoveDef = Pos;
3812  }
3813  } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3814  OrderBeforeUse = true;
3815  if (MoveUse == 0)
3816  MoveUse = Pos;
3817  if (MoveUse != 0) {
3818  OrderAfterDef = true;
3819  MoveDef = Pos - 1;
3820  }
3821  } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3822  // Add the instruction before the scheduled instruction.
3823  OrderBeforeUse = true;
3824  if (MoveUse == 0)
3825  MoveUse = Pos;
3826  } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3827  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3828  if (MoveUse == 0) {
3829  OrderBeforeDef = true;
3830  MoveUse = Pos;
3831  }
3832  }
3833  }
3834  // Check for order dependences between instructions. Make sure the source
3835  // is ordered before the destination.
3836  for (auto &S : SU->Succs) {
3837  if (S.getSUnit() != *I)
3838  continue;
3839  if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3840  OrderBeforeUse = true;
3841  if (Pos < MoveUse)
3842  MoveUse = Pos;
3843  }
3844  }
3845  for (auto &P : SU->Preds) {
3846  if (P.getSUnit() != *I)
3847  continue;
3848  if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3849  OrderAfterDef = true;
3850  MoveDef = Pos;
3851  }
3852  }
3853  }
3854 
3855  // A circular dependence.
3856  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3857  OrderBeforeUse = false;
3858 
3859  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3860  // to a loop-carried dependence.
3861  if (OrderBeforeDef)
3862  OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3863 
3864  // The uncommon case when the instruction order needs to be updated because
3865  // there is both a use and def.
3866  if (OrderBeforeUse && OrderAfterDef) {
3867  SUnit *UseSU = Insts.at(MoveUse);
3868  SUnit *DefSU = Insts.at(MoveDef);
3869  if (MoveUse > MoveDef) {
3870  Insts.erase(Insts.begin() + MoveUse);
3871  Insts.erase(Insts.begin() + MoveDef);
3872  } else {
3873  Insts.erase(Insts.begin() + MoveDef);
3874  Insts.erase(Insts.begin() + MoveUse);
3875  }
3876  orderDependence(SSD, UseSU, Insts);
3877  orderDependence(SSD, SU, Insts);
3878  orderDependence(SSD, DefSU, Insts);
3879  return;
3880  }
3881  // Put the new instruction first if there is a use in the list. Otherwise,
3882  // put it at the end of the list.
3883  if (OrderBeforeUse)
3884  Insts.push_front(SU);
3885  else
3886  Insts.push_back(SU);
3887 }
3888 
3889 /// Return true if the scheduled Phi has a loop carried operand.
3890 bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3891  if (!Phi.isPHI())
3892  return false;
3893  assert(Phi.isPHI() && "Expecting a Phi.");
3894  SUnit *DefSU = SSD->getSUnit(&Phi);
3895  unsigned DefCycle = cycleScheduled(DefSU);
3896  int DefStage = stageScheduled(DefSU);
3897 
3898  unsigned InitVal = 0;
3899  unsigned LoopVal = 0;
3900  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3901  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3902  if (!UseSU)
3903  return true;
3904  if (UseSU->getInstr()->isPHI())
3905  return true;
3906  unsigned LoopCycle = cycleScheduled(UseSU);
3907  int LoopStage = stageScheduled(UseSU);
3908  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3909 }
3910 
3911 /// Return true if the instruction is a definition that is loop carried
3912 /// and defines the use on the next iteration.
3913 /// v1 = phi(v2, v3)
3914 /// (Def) v3 = op v1
3915 /// (MO) = v1
3916 /// If MO appears before Def, then then v1 and v3 may get assigned to the same
3917 /// register.
3918 bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3920  if (!MO.isReg())
3921  return false;
3922  if (Def->isPHI())
3923  return false;
3924  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3925  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3926  return false;
3927  if (!isLoopCarried(SSD, *Phi))
3928  return false;
3929  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3930  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
3931  MachineOperand &DMO = Def->getOperand(i);
3932  if (!DMO.isReg() || !DMO.isDef())
3933  continue;
3934  if (DMO.getReg() == LoopReg)
3935  return true;
3936  }
3937  return false;
3938 }
3939 
3940 // Check if the generated schedule is valid. This function checks if
3941 // an instruction that uses a physical register is scheduled in a
3942 // different stage than the definition. The pipeliner does not handle
3943 // physical register values that may cross a basic block boundary.
3944 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3945  for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
3946  SUnit &SU = SSD->SUnits[i];
3947  if (!SU.hasPhysRegDefs)
3948  continue;
3949  int StageDef = stageScheduled(&SU);
3950  assert(StageDef != -1 && "Instruction should have been scheduled.");
3951  for (auto &SI : SU.Succs)
3952  if (SI.isAssignedRegDep())
3953  if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
3954  if (stageScheduled(SI.getSUnit()) != StageDef)
3955  return false;
3956  }
3957  return true;
3958 }
3959 
3960 /// A property of the node order in swing-modulo-scheduling is
3961 /// that for nodes outside circuits the following holds:
3962 /// none of them is scheduled after both a successor and a
3963 /// predecessor.
3964 /// The method below checks whether the property is met.
3965 /// If not, debug information is printed and statistics information updated.
3966 /// Note that we do not use an assert statement.
3967 /// The reason is that although an invalid node oder may prevent
3968 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3969 /// it does not lead to the generation of incorrect code.
3970 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3971 
3972  // a sorted vector that maps each SUnit to its index in the NodeOrder
3973  typedef std::pair<SUnit *, unsigned> UnitIndex;
3974  std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3975 
3976  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3977  Indices.push_back(std::make_pair(NodeOrder[i], i));
3978 
3979  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3980  return std::get<0>(i1) < std::get<0>(i2);
3981  };
3982 
3983  // sort, so that we can perform a binary search
3984  llvm::sort(Indices.begin(), Indices.end(), CompareKey);
3985 
3986  bool Valid = true;
3987  (void)Valid;
3988  // for each SUnit in the NodeOrder, check whether
3989  // it appears after both a successor and a predecessor
3990  // of the SUnit. If this is the case, and the SUnit
3991  // is not part of circuit, then the NodeOrder is not
3992  // valid.
3993  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3994  SUnit *SU = NodeOrder[i];
3995  unsigned Index = i;
3996 
3997  bool PredBefore = false;
3998  bool SuccBefore = false;
3999 
4000  SUnit *Succ;
4001  SUnit *Pred;
4002  (void)Succ;
4003  (void)Pred;
4004 
4005  for (SDep &PredEdge : SU->Preds) {
4006  SUnit *PredSU = PredEdge.getSUnit();
4007  unsigned PredIndex =
4008  std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
4009  std::make_pair(PredSU, 0), CompareKey));
4010  if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
4011  PredBefore = true;
4012  Pred = PredSU;
4013  break;
4014  }
4015  }
4016 
4017  for (SDep &SuccEdge : SU->Succs) {
4018  SUnit *SuccSU = SuccEdge.getSUnit();
4019  unsigned SuccIndex =
4020  std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
4021  std::make_pair(SuccSU, 0), CompareKey));
4022  if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
4023  SuccBefore = true;
4024  Succ = SuccSU;
4025  break;
4026  }
4027  }
4028 
4029  if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
4030  // instructions in circuits are allowed to be scheduled
4031  // after both a successor and predecessor.
4032  bool InCircuit = std::any_of(
4033  Circuits.begin(), Circuits.end(),
4034  [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
4035  if (InCircuit)
4036  LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
4037  else {
4038  Valid = false;
4039  NumNodeOrderIssues++;
4040  LLVM_DEBUG(dbgs() << "Predecessor ";);
4041  }
4042  LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
4043  << " are scheduled before node " << SU->NodeNum
4044  << "\n";);
4045  }
4046  }
4047 
4048  LLVM_DEBUG({
4049  if (!Valid)
4050  dbgs() << "Invalid node order found!\n";
4051  });
4052 }
4053 
4054 /// Attempt to fix the degenerate cases when the instruction serialization
4055 /// causes the register lifetimes to overlap. For example,
4056 /// p' = store_pi(p, b)
4057 /// = load p, offset
4058 /// In this case p and p' overlap, which means that two registers are needed.
4059 /// Instead, this function changes the load to use p' and updates the offset.
4060 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
4061  unsigned OverlapReg = 0;
4062  unsigned NewBaseReg = 0;
4063  for (SUnit *SU : Instrs) {
4064  MachineInstr *MI = SU->getInstr();
4065  for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
4066  const MachineOperand &MO = MI->getOperand(i);
4067  // Look for an instruction that uses p. The instruction occurs in the
4068  // same cycle but occurs later in the serialized order.
4069  if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
4070  // Check that the instruction appears in the InstrChanges structure,
4071  // which contains instructions that can have the offset updated.
4073  InstrChanges.find(SU);
4074  if (It != InstrChanges.end()) {
4075  unsigned BasePos, OffsetPos;
4076  // Update the base register and adjust the offset.
4077  if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
4078  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
4079  NewMI->getOperand(BasePos).setReg(NewBaseReg);
4080  int64_t NewOffset =
4081  MI->getOperand(OffsetPos).getImm() - It->second.second;
4082  NewMI->getOperand(OffsetPos).setImm(NewOffset);
4083  SU->setInstr(NewMI);
4084  MISUnitMap[NewMI] = SU;
4085  NewMIs.insert(NewMI);
4086  }
4087  }
4088  OverlapReg = 0;
4089  NewBaseReg = 0;
4090  break;
4091  }
4092  // Look for an instruction of the form p' = op(p), which uses and defines
4093  // two virtual registers that get allocated to the same physical register.
4094  unsigned TiedUseIdx = 0;
4095  if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
4096  // OverlapReg is p in the example above.
4097  OverlapReg = MI->getOperand(TiedUseIdx).getReg();
4098  // NewBaseReg is p' in the example above.
4099  NewBaseReg = MI->getOperand(i).getReg();
4100  break;
4101  }
4102  }
4103  }
4104 }
4105 
4106 /// After the schedule has been formed, call this function to combine
4107 /// the instructions from the different stages/cycles. That is, this
4108 /// function creates a schedule that represents a single iteration.
4109 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
4110  // Move all instructions to the first stage from later stages.
4111  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
4112  for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
4113  ++stage) {
4114  std::deque<SUnit *> &cycleInstrs =
4115  ScheduledInstrs[cycle + (stage * InitiationInterval)];
4116  for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
4117  E = cycleInstrs.rend();
4118  I != E; ++I)
4119  ScheduledInstrs[cycle].push_front(*I);
4120  }
4121  }
4122  // Iterate over the definitions in each instruction, and compute the
4123  // stage difference for each use. Keep the maximum value.
4124  for (auto &I : InstrToCycle) {
4125  int DefStage = stageScheduled(I.first);
4126  MachineInstr *MI = I.first->getInstr();
4127  for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
4128  MachineOperand &Op = MI->getOperand(i);
4129  if (!Op.isReg() || !Op.isDef())
4130  continue;
4131 
4132  unsigned Reg = Op.getReg();
4133  unsigned MaxDiff = 0;
4134  bool PhiIsSwapped = false;
4135  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
4136  EI = MRI.use_end();
4137  UI != EI; ++UI) {
4138  MachineOperand &UseOp = *UI;
4139  MachineInstr *UseMI = UseOp.getParent();
4140  SUnit *SUnitUse = SSD->getSUnit(UseMI);
4141  int UseStage = stageScheduled(SUnitUse);
4142  unsigned Diff = 0;
4143  if (UseStage != -1 && UseStage >= DefStage)
4144  Diff = UseStage - DefStage;
4145  if (MI->isPHI()) {
4146  if (isLoopCarried(SSD, *MI))
4147  ++Diff;
4148  else
4149  PhiIsSwapped = true;
4150  }
4151  MaxDiff = std::max(Diff, MaxDiff);
4152  }
4153  RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
4154  }
4155  }
4156 
4157  // Erase all the elements in the later stages. Only one iteration should
4158  // remain in the scheduled list, and it contains all the instructions.
4159  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
4160  ScheduledInstrs.erase(cycle);
4161 
4162  // Change the registers in instruction as specified in the InstrChanges
4163  // map. We need to use the new registers to create the correct order.
4164  for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
4165  SUnit *SU = &SSD->SUnits[i];
4166  SSD->applyInstrChange(SU->getInstr(), *this);
4167  }
4168 
4169  // Reorder the instructions in each cycle to fix and improve the
4170  // generated code.
4171  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
4172  std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
4173  std::deque<SUnit *> newOrderPhi;
4174  for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
4175  SUnit *SU = cycleInstrs[i];
4176  if (SU->getInstr()->isPHI())
4177  newOrderPhi.push_back(SU);
4178  }
4179  std::deque<SUnit *> newOrderI;
4180  for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
4181  SUnit *SU = cycleInstrs[i];
4182  if (!SU->getInstr()->isPHI())
4183  orderDependence(SSD, SU, newOrderI);
4184  }
4185  // Replace the old order with the new order.
4186  cycleInstrs.swap(newOrderPhi);
4187  cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
4188  SSD->fixupRegisterOverlaps(cycleInstrs);
4189  }
4190 
4191  LLVM_DEBUG(dump(););
4192 }
4193 
4194 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4195 /// Print the schedule information to the given output.
4196 void SMSchedule::print(raw_ostream &os) const {
4197  // Iterate over each cycle.
4198  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
4199  // Iterate over each instruction in the cycle.
4200  const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
4201  for (SUnit *CI : cycleInstrs->second) {
4202  os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
4203  os << "(" << CI->NodeNum << ") ";
4204  CI->getInstr()->print(os);
4205  os << "\n";
4206  }
4207  }
4208 }
4209 
4210 /// Utility function used for debugging to print the schedule.
4211 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
4212 #endif
uint64_t CallInst * C
std::vector< int >::const_reverse_iterator const_reverse_iterator
Definition: ScheduleDAG.h:754
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:259
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, cl::desc("Ignore RecMII"))
BitVector & set()
Definition: BitVector.h:398
virtual void finishBlock()
Cleans up after scheduling in the given block.
mop_iterator operands_end()
Definition: MachineInstr.h:454
A common definition of LaneBitmask for use in TableGen and CodeGen.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void clear()
Definition: MapVector.h:89
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
instr_iterator instr_begin()
static bool pred_L(SetVector< SUnit *> &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi...
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:633
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
instr_iterator instr_end()
MachineBasicBlock * getMBB() const
const TargetRegisterClass * getRegClass(unsigned Reg) const
Return the register class of the specified virtual register.
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
bool isAllocatable(unsigned PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn&#39;t been...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:78
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:383
unsigned getDepth() const
Returns the depth of this node, which is the length of the maximum path up to any node which has no p...
Definition: ScheduleDAG.h:402
This provides a very simple, boring adaptor for a begin and end iterator into a range type...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned getReg() const
getReg - Returns the register number.
bool operator>(int64_t V1, const APSInt &V2)
Definition: APSInt.h:327
static void getUnderlyingObjects(MachineInstr *MI, SmallVectorImpl< Value *> &Objs, const DataLayout &DL)
Return the underlying objects for the memory references of an instruction.
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
unsigned Reg
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
void transferSuccessors(MachineBasicBlock *FromMBB)
Transfers all the successors from MBB to this machine basic block (i.e., copies all the successors Fr...
unsigned getSubReg() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
bool isInlineAsm() const
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
The two locations do not alias at all.
Definition: AliasAnalysis.h:85
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
bool test(unsigned Idx) const
Definition: BitVector.h:502
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
This class implements a map that also provides access to all stored values in a deterministic order...
Definition: MapVector.h:38
static void dump(StringRef Title, SpillInfo const &Spills)
Definition: CoroFrame.cpp:299
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
A debug info location.
Definition: DebugLoc.h:34
F(f)
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
block Block Frequency true
unsigned getReg() const
Returns the register associated with this edge.
Definition: ScheduleDAG.h:219
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:459
bool isPHI() const
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar, MachineInstr &Cmp, SmallVectorImpl< MachineOperand > &Cond, SmallVectorImpl< MachineInstr *> &PrevInsts, unsigned Iter, unsigned MaxIter) const
Generate code to reduce the loop iteration by one and check if the loop is finished.
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
SmallVectorImpl< SDep >::iterator succ_iterator
Definition: ScheduleDAG.h:264
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:260
virtual void adjustSchedDependency(SUnit *def, SUnit *use, SDep &dep) const
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:55
void recede(SmallVectorImpl< RegisterMaskPair > *LiveUses=nullptr)
Recede across the previous instruction.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:191
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
The main low level interface to the alias analysis implementation.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
void eraseFromParent()
This method unlinks &#39;this&#39; from the containing function and deletes it.
A description of a memory reference used in the backend.
static use_iterator use_end()
SmallVectorImpl< SDep >::iterator pred_iterator
Definition: ScheduleDAG.h:263
static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA)
Return true if the instruction causes a chain between memory references before and after it...
void closeBottom()
Set the boundary for the bottom of the region and summarize live outs.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:143
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:412
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:370
SmallVectorImpl< SDep >::const_iterator const_pred_iterator
Definition: ScheduleDAG.h:265
Modulo Software Pipelining
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:54
This file contains the simple types necessary to represent the attributes associated with functions a...
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:284
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:409
#define LLVM_DUMP_METHOD
Definition: Compiler.h:74
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
void pop_back()
Remove the last element of the SetVector.
Definition: SetVector.h:222
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:83
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
BlockT * getHeader() const
Definition: LoopInfo.h:100
void assign(size_type NumElts, const T &Elt)
Definition: SmallVector.h:423
MachineInstr * getVRegDef(unsigned Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
defusechain_iterator - This class provides iterator support for machine operands in the function that...
PowerPC VSX FMA Mutation
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:56
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:406
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
SlotIndexes pass.
Definition: SlotIndexes.h:331
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool succ_L(SetVector< SUnit *> &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static bool isSubset(S1Ty &Set1, S2Ty &Set2)
Return true if Set1 is a subset of Set2.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
Itinerary data supplied by a subtarget to be used by a target.
auto lower_bound(R &&Range, ForwardIt I) -> decltype(adl_begin(Range))
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1138
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual const TargetInstrInfo * getInstrInfo() const
SUnit * getSUnit() const
Definition: ScheduleDAG.h:484
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
iterator find(const KeyT &Key)
Definition: MapVector.h:148
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
virtual bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA=nullptr) const
Sometimes, it is possible for the target to tell, even without aliasing information, that two MIs access different memory addresses.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst)
Gets latency information for Inst, based on DC information.
bool isDereferenceableInvariantLoad(AliasAnalysis *AA) const
Return true if this load instruction never traps and points to a memory location whose value doesn&#39;t ...
const TargetRegisterClass * constrainRegClass(unsigned Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
#define DEBUG_TYPE
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:348
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:211
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1108
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget...
TargetInstrInfo - Interface to description of machine instruction set.
AliasResult
The possible results of an alias query.
Definition: AliasAnalysis.h:79
const Value * getValue() const
Return the base address of the memory access.
static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming)
Remove the incoming block from the Phis in a basic block.
Scheduling dependency.
Definition: ScheduleDAG.h:50
static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB)
Return the Phi register value that comes from the incoming block.
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:579
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:571
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:820
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:377
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn&#39;t necessary for c...
Definition: ScheduleDAG.h:201
unsigned const MachineRegisterInfo * MRI
bool hasInterval(unsigned Reg) const
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx...
static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB, MachineRegisterInfo &MRI)
Return true if the register has a use that occurs outside the specified loop.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:516
virtual bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI) const
Get the base register and byte offset of an instruction that reads/writes memory. ...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineInstrBuilder & UseMI
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE and DBG_LABEL instructions...
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
void setMBB(MachineBasicBlock *MBB)
Represent the analysis usage information of a pass.
static Type * getVoidTy(LLVMContext &C)
Definition: Type.cpp:161
use_instr_iterator use_instr_begin(unsigned RegNo) const
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:549
BitVector & reset()
Definition: BitVector.h:439
bool any_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1049
Track the current register pressure at some position in the instruction stream, and remember the high...
void setImm(int64_t immVal)
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:181
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an an edge to be removed from the specified node N fr...
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1082
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:194
void DeleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1392
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Any other ordering dependency.
Definition: ScheduleDAG.h:57
size_t size() const
Definition: SmallVector.h:53
INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_END(MachinePipeliner
typename vector_type::const_iterator const_iterator
Definition: SetVector.h:50
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
void push_back(bool Val)
Definition: BitVector.h:507
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:972
hexagon gen pred
static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg, MachineBasicBlock *MBB, MachineRegisterInfo &MRI, LiveIntervals &LIS)
Replace all uses of FromReg that appear outside the specified basic block with ToReg.
An unknown scheduling barrier.
Definition: ScheduleDAG.h:70
const unsigned MaxDepth
Representation for a specific memory location.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.
typename vector_type::const_iterator iterator
Definition: SetVector.h:49
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:298
print lazy value Lazy Value Info Printer Pass
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
unsigned countPopulation(T Value)
Count the number of set bits in a value.
Definition: MathExtras.h:520
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
auto size(R &&Range, typename std::enable_if< std::is_same< typename std::iterator_traits< decltype(Range.begin())>::iterator_category, std::random_access_iterator_tag >::value, void >::type *=nullptr) -> decltype(std::distance(Range.begin(), Range.end()))
Get the size of a range.
Definition: STLExtras.h:1023
reverse_instr_iterator instr_rbegin()
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:378
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:534
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
MachineOperand class - Representation of each machine instruction operand.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
MachineInstrBuilder MachineInstrBuilder & DefMI
bool canReserveResources(const MCInstrDesc *MID)
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:381
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
int64_t getImm() const
MachineInstr * getUniqueVRegDef(unsigned Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
reverse_instr_iterator instr_rend()
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:941
std::vector< int >::const_iterator const_iterator
Definition: ScheduleDAG.h:747
void clear()
Completely clear the SetVector.
Definition: SetVector.h:216
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
typename SuperClass::iterator iterator
Definition: SmallVector.h:327
void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New)
Replace successor OLD with NEW and update probability info.
SmallVectorImpl< SDep >::const_iterator const_succ_iterator
Definition: ScheduleDAG.h:266
LiveInterval & createEmptyInterval(unsigned Reg)
Interval creation.
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:212
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:148
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:254
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
void initializeMachinePipelinerPass(PassRegistry &)
TargetSubtargetInfo - Generic base class for all target subtargets.
std::set< NodeId > NodeSet
Definition: RDFGraph.h:514
unsigned getHeight() const
Returns the height of this node, which is the length of the maximum path down to any node which has n...
Definition: ScheduleDAG.h:410
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:1963
A ScheduleDAG for scheduling lists of MachineInstr.
Representation of each machine instruction.
Definition: MachineInstr.h:64
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:163
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, MachineInstr *&CmpInst) const
Analyze the loop code, return true if it cannot be understoo.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
iterator begin() const
Definition: SmallPtrSet.h:397
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
use_iterator use_begin(unsigned RegNo) const
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
These values represent a non-pipelined step in the execution of an instruction.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:459
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
This file provides utility analysis objects describing memory locations.
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode...
Definition: MCInstrInfo.h:45
void setReg(unsigned Reg)
Change the register this operand corresponds to.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
void push_back(MachineInstr *MI)
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:73
void setSubReg(unsigned subReg)
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don&#39;t consume any machine resources in their current form...
void getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff, RegPressureDelta &Delta, ArrayRef< PressureChange > CriticalPSets, ArrayRef< unsigned > MaxPressureLimit)
Consider the pressure increase caused by traversing this instruction bottom-up.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:490
bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx=nullptr) const
Given the index of a register def operand, check if the register def is tied to a source operand...
iterator end() const
Definition: SmallPtrSet.h:402
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:181
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void init(const MachineFunction *mf, const RegisterClassInfo *rci, const LiveIntervals *lis, const MachineBasicBlock *mbb, MachineBasicBlock::const_iterator pos, bool TrackLaneMasks, bool TrackUntiedDefs)
Setup the RegPressureTracker.
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:268
void GetUnderlyingObjects(Value *V, SmallVectorImpl< Value *> &Objects, const DataLayout &DL, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to GetUnderlyingObject except that it can look through phi and select instruct...
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:807
bool memoperands_empty() const
Return true if we don&#39;t have any memory operands which described the memory access done by this instr...
Definition: MachineInstr.h:546
Store the effects of a change in pressure on things that MI scheduler cares about.
iterator end()
Definition: MapVector.h:72
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void insert(iterator MBBI, MachineBasicBlock *MBB)
void setMemRefs(MachineFunction &MF, ArrayRef< MachineMemOperand *> MemRefs)
Assign this MachineInstr&#39;s memory reference descriptor list.
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
LLVM Value Representation.
Definition: Value.h:73
mop_iterator operands_begin()
Definition: MachineInstr.h:453
A vector that has set insertion semantics.
Definition: SetVector.h:41
static use_instr_iterator use_instr_end()
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:261
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:200
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore...
IRTranslator LLVM IR MI
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
void setPos(MachineBasicBlock::const_iterator Pos)
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1961
unsigned getPSet() const
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned char TargetFlags=0) const
static bool computePath(SUnit *Cur, SetVector< SUnit *> &Path, SetVector< SUnit *> &DestNodes, SetVector< SUnit *> &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
#define LLVM_DEBUG(X)
Definition: Debug.h:123
This class can compute a topological ordering for SUnits and provides methods for dynamically updatin...
Definition: ScheduleDAG.h:693
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:414
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
std::vector< MachineBasicBlock * >::iterator succ_iterator
unsigned createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:246
void reserveResources(const MCInstrDesc *MID)
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:165
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:435