Line data Source code
1 : //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11 : //
12 : // Software pipelining (SWP) is an instruction scheduling technique for loops
13 : // that overlap loop iterations and exploits ILP via a compiler transformation.
14 : //
15 : // Swing Modulo Scheduling is an implementation of software pipelining
16 : // that generates schedules that are near optimal in terms of initiation
17 : // interval, register requirements, and stage count. See the papers:
18 : //
19 : // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
20 : // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
21 : // Conference on Parallel Architectures and Compilation Techiniques.
22 : //
23 : // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
24 : // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
25 : // Transactions on Computers, Vol. 50, No. 3, 2001.
26 : //
27 : // "An Implementation of Swing Modulo Scheduling With Extensions for
28 : // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
29 : // Urbana-Chambpain, 2005.
30 : //
31 : //
32 : // The SMS algorithm consists of three main steps after computing the minimal
33 : // initiation interval (MII).
34 : // 1) Analyze the dependence graph and compute information about each
35 : // instruction in the graph.
36 : // 2) Order the nodes (instructions) by priority based upon the heuristics
37 : // described in the algorithm.
38 : // 3) Attempt to schedule the nodes in the specified order using the MII.
39 : //
40 : // This SMS implementation is a target-independent back-end pass. When enabled,
41 : // the pass runs just prior to the register allocation pass, while the machine
42 : // IR is in SSA form. If software pipelining is successful, then the original
43 : // loop is replaced by the optimized loop. The optimized loop contains one or
44 : // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
45 : // the instructions cannot be scheduled in a given MII, we increase the MII by
46 : // one and try again.
47 : //
48 : // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
49 : // represent loop carried dependences in the DAG as order edges to the Phi
50 : // nodes. We also perform several passes over the DAG to eliminate unnecessary
51 : // edges that inhibit the ability to pipeline. The implementation uses the
52 : // DFAPacketizer class to compute the minimum initiation interval and the check
53 : // where an instruction may be inserted in the pipelined schedule.
54 : //
55 : // In order for the SMS pass to work, several target specific hooks need to be
56 : // implemented to get information about the loop structure and to rewrite
57 : // instructions.
58 : //
59 : //===----------------------------------------------------------------------===//
60 :
61 : #include "llvm/ADT/ArrayRef.h"
62 : #include "llvm/ADT/BitVector.h"
63 : #include "llvm/ADT/DenseMap.h"
64 : #include "llvm/ADT/MapVector.h"
65 : #include "llvm/ADT/PriorityQueue.h"
66 : #include "llvm/ADT/SetVector.h"
67 : #include "llvm/ADT/SmallPtrSet.h"
68 : #include "llvm/ADT/SmallSet.h"
69 : #include "llvm/ADT/SmallVector.h"
70 : #include "llvm/ADT/Statistic.h"
71 : #include "llvm/ADT/iterator_range.h"
72 : #include "llvm/Analysis/AliasAnalysis.h"
73 : #include "llvm/Analysis/MemoryLocation.h"
74 : #include "llvm/Analysis/ValueTracking.h"
75 : #include "llvm/CodeGen/DFAPacketizer.h"
76 : #include "llvm/CodeGen/LiveIntervals.h"
77 : #include "llvm/CodeGen/MachineBasicBlock.h"
78 : #include "llvm/CodeGen/MachineDominators.h"
79 : #include "llvm/CodeGen/MachineFunction.h"
80 : #include "llvm/CodeGen/MachineFunctionPass.h"
81 : #include "llvm/CodeGen/MachineInstr.h"
82 : #include "llvm/CodeGen/MachineInstrBuilder.h"
83 : #include "llvm/CodeGen/MachineLoopInfo.h"
84 : #include "llvm/CodeGen/MachineMemOperand.h"
85 : #include "llvm/CodeGen/MachineOperand.h"
86 : #include "llvm/CodeGen/MachineRegisterInfo.h"
87 : #include "llvm/CodeGen/RegisterClassInfo.h"
88 : #include "llvm/CodeGen/RegisterPressure.h"
89 : #include "llvm/CodeGen/ScheduleDAG.h"
90 : #include "llvm/CodeGen/ScheduleDAGInstrs.h"
91 : #include "llvm/CodeGen/ScheduleDAGMutation.h"
92 : #include "llvm/CodeGen/TargetInstrInfo.h"
93 : #include "llvm/CodeGen/TargetOpcodes.h"
94 : #include "llvm/CodeGen/TargetRegisterInfo.h"
95 : #include "llvm/CodeGen/TargetSubtargetInfo.h"
96 : #include "llvm/Config/llvm-config.h"
97 : #include "llvm/IR/Attributes.h"
98 : #include "llvm/IR/DebugLoc.h"
99 : #include "llvm/IR/Function.h"
100 : #include "llvm/MC/LaneBitmask.h"
101 : #include "llvm/MC/MCInstrDesc.h"
102 : #include "llvm/MC/MCInstrItineraries.h"
103 : #include "llvm/MC/MCRegisterInfo.h"
104 : #include "llvm/Pass.h"
105 : #include "llvm/Support/Casting.h"
106 : #include "llvm/Support/CommandLine.h"
107 : #include "llvm/Support/Compiler.h"
108 : #include "llvm/Support/Debug.h"
109 : #include "llvm/Support/MathExtras.h"
110 : #include "llvm/Support/raw_ostream.h"
111 : #include <algorithm>
112 : #include <cassert>
113 : #include <climits>
114 : #include <cstdint>
115 : #include <deque>
116 : #include <functional>
117 : #include <iterator>
118 : #include <map>
119 : #include <memory>
120 : #include <tuple>
121 : #include <utility>
122 : #include <vector>
123 :
124 : using namespace llvm;
125 :
126 : #define DEBUG_TYPE "pipeliner"
127 :
128 : STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
129 : STATISTIC(NumPipelined, "Number of loops software pipelined");
130 : STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
131 :
132 : /// A command line option to turn software pipelining on or off.
133 : static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
134 : cl::ZeroOrMore,
135 : cl::desc("Enable Software Pipelining"));
136 :
137 : /// A command line option to enable SWP at -Os.
138 : static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
139 : cl::desc("Enable SWP at Os."), cl::Hidden,
140 : cl::init(false));
141 :
142 : /// A command line argument to limit minimum initial interval for pipelining.
143 : static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
144 : cl::desc("Size limit for the MII."),
145 : cl::Hidden, cl::init(27));
146 :
147 : /// A command line argument to limit the number of stages in the pipeline.
148 : static cl::opt<int>
149 : SwpMaxStages("pipeliner-max-stages",
150 : cl::desc("Maximum stages allowed in the generated scheduled."),
151 : cl::Hidden, cl::init(3));
152 :
153 : /// A command line option to disable the pruning of chain dependences due to
154 : /// an unrelated Phi.
155 : static cl::opt<bool>
156 : SwpPruneDeps("pipeliner-prune-deps",
157 : cl::desc("Prune dependences between unrelated Phi nodes."),
158 : cl::Hidden, cl::init(true));
159 :
160 : /// A command line option to disable the pruning of loop carried order
161 : /// dependences.
162 : static cl::opt<bool>
163 : SwpPruneLoopCarried("pipeliner-prune-loop-carried",
164 : cl::desc("Prune loop carried order dependences."),
165 : cl::Hidden, cl::init(true));
166 :
167 : #ifndef NDEBUG
168 : static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
169 : #endif
170 :
171 : static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
172 : cl::ReallyHidden, cl::init(false),
173 : cl::ZeroOrMore, cl::desc("Ignore RecMII"));
174 :
175 : // A command line option to enable the CopyToPhi DAG mutation.
176 : static cl::opt<bool>
177 : SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
178 : cl::init(true), cl::ZeroOrMore,
179 : cl::desc("Enable CopyToPhi DAG Mutation"));
180 :
181 : namespace {
182 :
183 : class NodeSet;
184 : class SMSchedule;
185 :
186 : /// The main class in the implementation of the target independent
187 : /// software pipeliner pass.
188 : class MachinePipeliner : public MachineFunctionPass {
189 : public:
190 : MachineFunction *MF = nullptr;
191 : const MachineLoopInfo *MLI = nullptr;
192 : const MachineDominatorTree *MDT = nullptr;
193 : const InstrItineraryData *InstrItins;
194 : const TargetInstrInfo *TII = nullptr;
195 : RegisterClassInfo RegClassInfo;
196 :
197 : #ifndef NDEBUG
198 : static int NumTries;
199 : #endif
200 :
201 : /// Cache the target analysis information about the loop.
202 843 : struct LoopInfo {
203 : MachineBasicBlock *TBB = nullptr;
204 : MachineBasicBlock *FBB = nullptr;
205 : SmallVector<MachineOperand, 4> BrCond;
206 : MachineInstr *LoopInductionVar = nullptr;
207 : MachineInstr *LoopCompare = nullptr;
208 : };
209 : LoopInfo LI;
210 :
211 : static char ID;
212 :
213 843 : MachinePipeliner() : MachineFunctionPass(ID) {
214 843 : initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
215 843 : }
216 :
217 : bool runOnMachineFunction(MachineFunction &MF) override;
218 :
219 836 : void getAnalysisUsage(AnalysisUsage &AU) const override {
220 : AU.addRequired<AAResultsWrapperPass>();
221 : AU.addPreserved<AAResultsWrapperPass>();
222 : AU.addRequired<MachineLoopInfo>();
223 : AU.addRequired<MachineDominatorTree>();
224 : AU.addRequired<LiveIntervals>();
225 836 : MachineFunctionPass::getAnalysisUsage(AU);
226 836 : }
227 :
228 : private:
229 : void preprocessPhiNodes(MachineBasicBlock &B);
230 : bool canPipelineLoop(MachineLoop &L);
231 : bool scheduleLoop(MachineLoop &L);
232 : bool swingModuloScheduler(MachineLoop &L);
233 : };
234 :
235 : /// This class builds the dependence graph for the instructions in a loop,
236 : /// and attempts to schedule the instructions using the SMS algorithm.
237 : class SwingSchedulerDAG : public ScheduleDAGInstrs {
238 : MachinePipeliner &Pass;
239 : /// The minimum initiation interval between iterations for this schedule.
240 : unsigned MII = 0;
241 : /// Set to true if a valid pipelined schedule is found for the loop.
242 : bool Scheduled = false;
243 : MachineLoop &Loop;
244 : LiveIntervals &LIS;
245 : const RegisterClassInfo &RegClassInfo;
246 :
247 : /// A toplogical ordering of the SUnits, which is needed for changing
248 : /// dependences and iterating over the SUnits.
249 : ScheduleDAGTopologicalSort Topo;
250 :
251 : struct NodeInfo {
252 : int ASAP = 0;
253 : int ALAP = 0;
254 : int ZeroLatencyDepth = 0;
255 : int ZeroLatencyHeight = 0;
256 :
257 : NodeInfo() = default;
258 : };
259 : /// Computed properties for each node in the graph.
260 : std::vector<NodeInfo> ScheduleInfo;
261 :
262 : enum OrderKind { BottomUp = 0, TopDown = 1 };
263 : /// Computed node ordering for scheduling.
264 : SetVector<SUnit *> NodeOrder;
265 :
266 : using NodeSetType = SmallVector<NodeSet, 8>;
267 : using ValueMapTy = DenseMap<unsigned, unsigned>;
268 : using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
269 : using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
270 :
271 : /// Instructions to change when emitting the final schedule.
272 : DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
273 :
274 : /// We may create a new instruction, so remember it because it
275 : /// must be deleted when the pass is finished.
276 : SmallPtrSet<MachineInstr *, 4> NewMIs;
277 :
278 : /// Ordered list of DAG postprocessing steps.
279 : std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
280 :
281 : /// Helper class to implement Johnson's circuit finding algorithm.
282 : class Circuits {
283 : std::vector<SUnit> &SUnits;
284 : SetVector<SUnit *> Stack;
285 : BitVector Blocked;
286 : SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
287 : SmallVector<SmallVector<int, 4>, 16> AdjK;
288 : // Node to Index from ScheduleDAGTopologicalSort
289 : std::vector<int> *Node2Idx;
290 : unsigned NumPaths;
291 : static unsigned MaxPaths;
292 :
293 : public:
294 190 : Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
295 570 : : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
296 380 : Node2Idx = new std::vector<int>(SUs.size());
297 : unsigned Idx = 0;
298 2201 : for (const auto &NodeNum : Topo)
299 2011 : Node2Idx->at(NodeNum) = Idx++;
300 190 : }
301 :
302 380 : ~Circuits() { delete Node2Idx; }
303 :
304 : /// Reset the data structures used in the circuit algorithm.
305 2011 : void reset() {
306 : Stack.clear();
307 : Blocked.reset();
308 6033 : B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
309 2011 : NumPaths = 0;
310 2011 : }
311 :
312 : void createAdjacencyStructure(SwingSchedulerDAG *DAG);
313 : bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
314 : void unblock(int U);
315 : };
316 :
317 190 : struct CopyToPhiMutation : public ScheduleDAGMutation {
318 : void apply(ScheduleDAGInstrs *DAG) override;
319 : };
320 :
321 : public:
322 190 : SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
323 : const RegisterClassInfo &rci)
324 380 : : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
325 380 : RegClassInfo(rci), Topo(SUnits, &ExitSU) {
326 190 : P.MF->getSubtarget().getSMSMutations(Mutations);
327 190 : if (SwpEnableCopyToPhi)
328 190 : Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
329 190 : }
330 :
331 : void schedule() override;
332 : void finishBlock() override;
333 :
334 : /// Return true if the loop kernel has been scheduled.
335 0 : bool hasNewSchedule() { return Scheduled; }
336 :
337 : /// Return the earliest time an instruction may be scheduled.
338 10092 : int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
339 :
340 : /// Return the latest time an instruction my be scheduled.
341 8948 : int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
342 :
343 : /// The mobility function, which the number of slots in which
344 : /// an instruction may be scheduled.
345 4582 : int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
346 :
347 : /// The depth, in the dependence graph, for a node.
348 0 : unsigned getDepth(SUnit *Node) { return Node->getDepth(); }
349 :
350 : /// The maximum unweighted length of a path from an arbitrary node to the
351 : /// given node in which each edge has latency 0
352 0 : int getZeroLatencyDepth(SUnit *Node) {
353 8440 : return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
354 : }
355 :
356 : /// The height, in the dependence graph, for a node.
357 0 : unsigned getHeight(SUnit *Node) { return Node->getHeight(); }
358 :
359 : /// The maximum unweighted length of a path from the given node to an
360 : /// arbitrary node in which each edge has latency 0
361 0 : int getZeroLatencyHeight(SUnit *Node) {
362 5940 : return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
363 : }
364 :
365 : /// Return true if the dependence is a back-edge in the data dependence graph.
366 : /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
367 : /// using an anti dependence from a Phi to an instruction.
368 0 : bool isBackedge(SUnit *Source, const SDep &Dep) {
369 0 : if (Dep.getKind() != SDep::Anti)
370 0 : return false;
371 0 : return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
372 : }
373 :
374 : bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
375 :
376 : /// The distance function, which indicates that operation V of iteration I
377 : /// depends on operations U of iteration I-distance.
378 0 : unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
379 : // Instructions that feed a Phi have a distance of 1. Computing larger
380 : // values for arrays requires data dependence information.
381 436 : if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
382 0 : return 1;
383 : return 0;
384 : }
385 :
386 : /// Set the Minimum Initiation Interval for this schedule attempt.
387 : void setMII(unsigned mii) { MII = mii; }
388 :
389 : void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
390 :
391 : void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
392 :
393 : /// Return the new base register that was stored away for the changed
394 : /// instruction.
395 : unsigned getInstrBaseReg(SUnit *SU) {
396 : DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
397 434 : InstrChanges.find(SU);
398 434 : if (It != InstrChanges.end())
399 53 : return It->second.first;
400 : return 0;
401 : }
402 :
403 : void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
404 : Mutations.push_back(std::move(Mutation));
405 : }
406 :
407 : static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
408 :
409 : private:
410 : void addLoopCarriedDependences(AliasAnalysis *AA);
411 : void updatePhiDependences();
412 : void changeDependences();
413 : unsigned calculateResMII();
414 : unsigned calculateRecMII(NodeSetType &RecNodeSets);
415 : void findCircuits(NodeSetType &NodeSets);
416 : void fuseRecs(NodeSetType &NodeSets);
417 : void removeDuplicateNodes(NodeSetType &NodeSets);
418 : void computeNodeFunctions(NodeSetType &NodeSets);
419 : void registerPressureFilter(NodeSetType &NodeSets);
420 : void colocateNodeSets(NodeSetType &NodeSets);
421 : void checkNodeSets(NodeSetType &NodeSets);
422 : void groupRemainingNodes(NodeSetType &NodeSets);
423 : void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
424 : SetVector<SUnit *> &NodesAdded);
425 : void computeNodeOrder(NodeSetType &NodeSets);
426 : void checkValidNodeOrder(const NodeSetType &Circuits) const;
427 : bool schedulePipeline(SMSchedule &Schedule);
428 : void generatePipelinedLoop(SMSchedule &Schedule);
429 : void generateProlog(SMSchedule &Schedule, unsigned LastStage,
430 : MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
431 : MBBVectorTy &PrologBBs);
432 : void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
433 : MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
434 : MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
435 : void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
436 : MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
437 : SMSchedule &Schedule, ValueMapTy *VRMap,
438 : InstrMapTy &InstrMap, unsigned LastStageNum,
439 : unsigned CurStageNum, bool IsLast);
440 : void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
441 : MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
442 : SMSchedule &Schedule, ValueMapTy *VRMap,
443 : InstrMapTy &InstrMap, unsigned LastStageNum,
444 : unsigned CurStageNum, bool IsLast);
445 : void removeDeadInstructions(MachineBasicBlock *KernelBB,
446 : MBBVectorTy &EpilogBBs);
447 : void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
448 : SMSchedule &Schedule);
449 : void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
450 : MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
451 : ValueMapTy *VRMap);
452 : bool computeDelta(MachineInstr &MI, unsigned &Delta);
453 : void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
454 : unsigned Num);
455 : MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
456 : unsigned InstStageNum);
457 : MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
458 : unsigned InstStageNum,
459 : SMSchedule &Schedule);
460 : void updateInstruction(MachineInstr *NewMI, bool LastDef,
461 : unsigned CurStageNum, unsigned InstrStageNum,
462 : SMSchedule &Schedule, ValueMapTy *VRMap);
463 : MachineInstr *findDefInLoop(unsigned Reg);
464 : unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
465 : unsigned LoopStage, ValueMapTy *VRMap,
466 : MachineBasicBlock *BB);
467 : void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
468 : SMSchedule &Schedule, ValueMapTy *VRMap,
469 : InstrMapTy &InstrMap);
470 : void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
471 : InstrMapTy &InstrMap, unsigned CurStageNum,
472 : unsigned PhiNum, MachineInstr *Phi,
473 : unsigned OldReg, unsigned NewReg,
474 : unsigned PrevReg = 0);
475 : bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
476 : unsigned &OffsetPos, unsigned &NewBase,
477 : int64_t &NewOffset);
478 : void postprocessDAG();
479 : };
480 :
481 : /// A NodeSet contains a set of SUnit DAG nodes with additional information
482 : /// that assigns a priority to the set.
483 3324 : class NodeSet {
484 : SetVector<SUnit *> Nodes;
485 : bool HasRecurrence = false;
486 : unsigned RecMII = 0;
487 : int MaxMOV = 0;
488 : unsigned MaxDepth = 0;
489 : unsigned Colocate = 0;
490 : SUnit *ExceedPressure = nullptr;
491 : unsigned Latency = 0;
492 :
493 : public:
494 : using iterator = SetVector<SUnit *>::const_iterator;
495 :
496 188 : NodeSet() = default;
497 578 : NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
498 : Latency = 0;
499 2492 : for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
500 6963 : for (const SDep &Succ : Nodes[i]->Succs)
501 3135 : if (Nodes.count(Succ.getSUnit()))
502 1946 : Latency += Succ.getLatency();
503 578 : }
504 :
505 545 : bool insert(SUnit *SU) { return Nodes.insert(SU); }
506 :
507 83 : void insert(iterator S, iterator E) { Nodes.insert(S, E); }
508 :
509 : template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
510 0 : return Nodes.remove_if(P);
511 : }
512 :
513 : unsigned count(SUnit *SU) const { return Nodes.count(SU); }
514 :
515 : bool hasRecurrence() { return HasRecurrence; };
516 :
517 420 : unsigned size() const { return Nodes.size(); }
518 :
519 : bool empty() const { return Nodes.empty(); }
520 :
521 1 : SUnit *getNode(unsigned i) const { return Nodes[i]; };
522 :
523 0 : void setRecMII(unsigned mii) { RecMII = mii; };
524 :
525 0 : void setColocate(unsigned c) { Colocate = c; };
526 :
527 1 : void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
528 :
529 0 : bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
530 :
531 0 : int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
532 :
533 0 : int getRecMII() { return RecMII; }
534 :
535 : /// Summarize node functions for the entire node set.
536 420 : void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
537 1656 : for (SUnit *SU : *this) {
538 1519 : MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
539 1890 : MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
540 : }
541 420 : }
542 :
543 0 : unsigned getLatency() { return Latency; }
544 :
545 0 : unsigned getMaxDepth() { return MaxDepth; }
546 :
547 : void clear() {
548 : Nodes.clear();
549 197 : RecMII = 0;
550 197 : HasRecurrence = false;
551 197 : MaxMOV = 0;
552 197 : MaxDepth = 0;
553 197 : Colocate = 0;
554 197 : ExceedPressure = nullptr;
555 : }
556 :
557 420 : operator SetVector<SUnit *> &() { return Nodes; }
558 :
559 : /// Sort the node sets by importance. First, rank them by recurrence MII,
560 : /// then by mobility (least mobile done first), and finally by depth.
561 : /// Each node set may contain a colocate value which is used as the first
562 : /// tie breaker, if it's set.
563 : bool operator>(const NodeSet &RHS) const {
564 0 : if (RecMII == RHS.RecMII) {
565 0 : if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
566 0 : return Colocate < RHS.Colocate;
567 0 : if (MaxMOV == RHS.MaxMOV)
568 0 : return MaxDepth > RHS.MaxDepth;
569 0 : return MaxMOV < RHS.MaxMOV;
570 : }
571 0 : return RecMII > RHS.RecMII;
572 : }
573 :
574 : bool operator==(const NodeSet &RHS) const {
575 : return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
576 : MaxDepth == RHS.MaxDepth;
577 : }
578 :
579 : bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
580 :
581 : iterator begin() { return Nodes.begin(); }
582 : iterator end() { return Nodes.end(); }
583 :
584 : void print(raw_ostream &os) const {
585 : os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
586 : << " depth " << MaxDepth << " col " << Colocate << "\n";
587 : for (const auto &I : Nodes)
588 : os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
589 : os << "\n";
590 : }
591 :
592 : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
593 : LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
594 : #endif
595 : };
596 :
597 : /// This class represents the scheduled code. The main data structure is a
598 : /// map from scheduled cycle to instructions. During scheduling, the
599 : /// data structure explicitly represents all stages/iterations. When
600 : /// the algorithm finshes, the schedule is collapsed into a single stage,
601 : /// which represents instructions from different loop iterations.
602 : ///
603 : /// The SMS algorithm allows negative values for cycles, so the first cycle
604 : /// in the schedule is the smallest cycle value.
605 : class SMSchedule {
606 : private:
607 : /// Map from execution cycle to instructions.
608 : DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
609 :
610 : /// Map from instruction to execution cycle.
611 : std::map<SUnit *, int> InstrToCycle;
612 :
613 : /// Map for each register and the max difference between its uses and def.
614 : /// The first element in the pair is the max difference in stages. The
615 : /// second is true if the register defines a Phi value and loop value is
616 : /// scheduled before the Phi.
617 : std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
618 :
619 : /// Keep track of the first cycle value in the schedule. It starts
620 : /// as zero, but the algorithm allows negative values.
621 : int FirstCycle = 0;
622 :
623 : /// Keep track of the last cycle value in the schedule.
624 : int LastCycle = 0;
625 :
626 : /// The initiation interval (II) for the schedule.
627 : int InitiationInterval = 0;
628 :
629 : /// Target machine information.
630 : const TargetSubtargetInfo &ST;
631 :
632 : /// Virtual register information.
633 : MachineRegisterInfo &MRI;
634 :
635 : std::unique_ptr<DFAPacketizer> Resources;
636 :
637 : public:
638 188 : SMSchedule(MachineFunction *mf)
639 564 : : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
640 188 : Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
641 :
642 277 : void reset() {
643 277 : ScheduledInstrs.clear();
644 : InstrToCycle.clear();
645 : RegToStageDiff.clear();
646 277 : FirstCycle = 0;
647 277 : LastCycle = 0;
648 277 : InitiationInterval = 0;
649 277 : }
650 :
651 : /// Set the initiation interval for this schedule.
652 272 : void setInitiationInterval(int ii) { InitiationInterval = ii; }
653 :
654 : /// Return the first cycle in the completed schedule. This
655 : /// can be a negative value.
656 0 : int getFirstCycle() const { return FirstCycle; }
657 :
658 : /// Return the last cycle in the finalized schedule.
659 1150 : int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
660 :
661 : /// Return the cycle of the earliest scheduled instruction in the dependence
662 : /// chain.
663 : int earliestCycleInChain(const SDep &Dep);
664 :
665 : /// Return the cycle of the latest scheduled instruction in the dependence
666 : /// chain.
667 : int latestCycleInChain(const SDep &Dep);
668 :
669 : void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
670 : int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
671 : bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
672 :
673 : /// Iterators for the cycle to instruction map.
674 : using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
675 : using const_sched_iterator =
676 : DenseMap<int, std::deque<SUnit *>>::const_iterator;
677 :
678 : /// Return true if the instruction is scheduled at the specified stage.
679 4788 : bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
680 4788 : return (stageScheduled(SU) == (int)StageNum);
681 : }
682 :
683 : /// Return the stage for a scheduled instruction. Return -1 if
684 : /// the instruction has not been scheduled.
685 : int stageScheduled(SUnit *SU) const {
686 : std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
687 29593 : if (it == InstrToCycle.end())
688 : return -1;
689 28080 : return (it->second - FirstCycle) / InitiationInterval;
690 : }
691 :
692 : /// Return the cycle for a scheduled instruction. This function normalizes
693 : /// the first cycle to be 0.
694 : unsigned cycleScheduled(SUnit *SU) const {
695 : std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
696 : assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
697 5955 : return (it->second - FirstCycle) % InitiationInterval;
698 : }
699 :
700 : /// Return the maximum stage count needed for this schedule.
701 : unsigned getMaxStageCount() {
702 500 : return (LastCycle - FirstCycle) / InitiationInterval;
703 : }
704 :
705 : /// Return the max. number of stages/iterations that can occur between a
706 : /// register definition and its uses.
707 3078 : unsigned getStagesForReg(int Reg, unsigned CurStage) {
708 3078 : std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
709 3078 : if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
710 6 : return 1;
711 : return Stages.first;
712 : }
713 :
714 : /// The number of stages for a Phi is a little different than other
715 : /// instructions. The minimum value computed in RegToStageDiff is 1
716 : /// because we assume the Phi is needed for at least 1 iteration.
717 : /// This is not the case if the loop value is scheduled prior to the
718 : /// Phi in the same stage. This function returns the number of stages
719 : /// or iterations needed between the Phi definition and any uses.
720 : unsigned getStagesForPhi(int Reg) {
721 864 : std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
722 432 : if (Stages.second)
723 : return Stages.first;
724 399 : return Stages.first - 1;
725 : }
726 :
727 : /// Return the instructions that are scheduled at the specified cycle.
728 : std::deque<SUnit *> &getInstructions(int cycle) {
729 24141 : return ScheduledInstrs[cycle];
730 : }
731 :
732 : bool isValidSchedule(SwingSchedulerDAG *SSD);
733 : void finalizeSchedule(SwingSchedulerDAG *SSD);
734 : void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
735 : std::deque<SUnit *> &Insts);
736 : bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
737 : bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
738 : MachineOperand &MO);
739 : void print(raw_ostream &os) const;
740 : void dump() const;
741 : };
742 :
743 : } // end anonymous namespace
744 :
745 : unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
746 : char MachinePipeliner::ID = 0;
747 : #ifndef NDEBUG
748 : int MachinePipeliner::NumTries = 0;
749 : #endif
750 : char &llvm::MachinePipelinerID = MachinePipeliner::ID;
751 :
752 31780 : INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
753 : "Modulo Software Pipelining", false, false)
754 31780 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
755 31780 : INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
756 31780 : INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
757 31780 : INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
758 85990 : INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
759 : "Modulo Software Pipelining", false, false)
760 :
761 : /// The "main" function for implementing Swing Modulo Scheduling.
762 3294 : bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
763 3294 : if (skipFunction(mf.getFunction()))
764 : return false;
765 :
766 3284 : if (!EnableSWP)
767 : return false;
768 :
769 3275 : if (mf.getFunction().getAttributes().hasAttribute(
770 3275 : AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
771 33 : !EnableSWPOptSize.getPosition())
772 31 : return false;
773 :
774 3244 : MF = &mf;
775 3244 : MLI = &getAnalysis<MachineLoopInfo>();
776 3244 : MDT = &getAnalysis<MachineDominatorTree>();
777 3244 : TII = MF->getSubtarget().getInstrInfo();
778 3244 : RegClassInfo.runOnMachineFunction(*MF);
779 :
780 3601 : for (auto &L : *MLI)
781 357 : scheduleLoop(*L);
782 :
783 : return false;
784 : }
785 :
786 : /// Attempt to perform the SMS algorithm on the specified loop. This function is
787 : /// the main entry point for the algorithm. The function identifies candidate
788 : /// loops, calculates the minimum initiation interval, and attempts to schedule
789 : /// the loop.
790 395 : bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
791 : bool Changed = false;
792 433 : for (auto &InnerLoop : L)
793 38 : Changed |= scheduleLoop(*InnerLoop);
794 :
795 : #ifndef NDEBUG
796 : // Stop trying after reaching the limit (if any).
797 : int Limit = SwpLoopLimit;
798 : if (Limit >= 0) {
799 : if (NumTries >= SwpLoopLimit)
800 : return Changed;
801 : NumTries++;
802 : }
803 : #endif
804 :
805 395 : if (!canPipelineLoop(L))
806 : return Changed;
807 :
808 : ++NumTrytoPipeline;
809 :
810 190 : Changed = swingModuloScheduler(L);
811 :
812 190 : return Changed;
813 : }
814 :
815 : /// Return true if the loop can be software pipelined. The algorithm is
816 : /// restricted to loops with a single basic block. Make sure that the
817 : /// branch in the loop can be analyzed.
818 395 : bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
819 395 : if (L.getNumBlocks() != 1)
820 : return false;
821 :
822 : // Check if the branch can't be understood because we can't do pipelining
823 : // if that's the case.
824 333 : LI.TBB = nullptr;
825 333 : LI.FBB = nullptr;
826 : LI.BrCond.clear();
827 666 : if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
828 : return false;
829 :
830 333 : LI.LoopInductionVar = nullptr;
831 333 : LI.LoopCompare = nullptr;
832 333 : if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
833 : return false;
834 :
835 191 : if (!L.getLoopPreheader())
836 : return false;
837 :
838 : // Remove any subregisters from inputs to phi nodes.
839 190 : preprocessPhiNodes(*L.getHeader());
840 190 : return true;
841 : }
842 :
843 190 : void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
844 190 : MachineRegisterInfo &MRI = MF->getRegInfo();
845 190 : SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
846 :
847 803 : for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
848 423 : MachineOperand &DefOp = PI.getOperand(0);
849 : assert(DefOp.getSubReg() == 0);
850 423 : auto *RC = MRI.getRegClass(DefOp.getReg());
851 :
852 1269 : for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
853 846 : MachineOperand &RegOp = PI.getOperand(i);
854 846 : if (RegOp.getSubReg() == 0)
855 833 : continue;
856 :
857 : // If the operand uses a subregister, replace it with a new register
858 : // without subregisters, and generate a copy to the new register.
859 13 : unsigned NewReg = MRI.createVirtualRegister(RC);
860 13 : MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
861 13 : MachineBasicBlock::iterator At = PredB.getFirstTerminator();
862 : const DebugLoc &DL = PredB.findDebugLoc(At);
863 26 : auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
864 : .addReg(RegOp.getReg(), getRegState(RegOp),
865 13 : RegOp.getSubReg());
866 13 : Slots.insertMachineInstrInMaps(*Copy);
867 13 : RegOp.setReg(NewReg);
868 : RegOp.setSubReg(0);
869 : }
870 : }
871 190 : }
872 :
873 : /// The SMS algorithm consists of the following main steps:
874 : /// 1. Computation and analysis of the dependence graph.
875 : /// 2. Ordering of the nodes (instructions).
876 : /// 3. Attempt to Schedule the loop.
877 190 : bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
878 : assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
879 :
880 190 : SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
881 :
882 : MachineBasicBlock *MBB = L.getHeader();
883 : // The kernel should not include any terminator instructions. These
884 : // will be added back later.
885 190 : SMS.startBlock(MBB);
886 :
887 : // Compute the number of 'real' instructions in the basic block by
888 : // ignoring terminators.
889 : unsigned size = MBB->size();
890 569 : for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
891 : E = MBB->instr_end();
892 569 : I != E; ++I, --size)
893 : ;
894 :
895 190 : SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
896 190 : SMS.schedule();
897 190 : SMS.exitRegion();
898 :
899 190 : SMS.finishBlock();
900 190 : return SMS.hasNewSchedule();
901 : }
902 :
903 : /// We override the schedule function in ScheduleDAGInstrs to implement the
904 : /// scheduling part of the Swing Modulo Scheduling algorithm.
905 190 : void SwingSchedulerDAG::schedule() {
906 190 : AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
907 190 : buildSchedGraph(AA);
908 190 : addLoopCarriedDependences(AA);
909 190 : updatePhiDependences();
910 190 : Topo.InitDAGTopologicalSorting();
911 190 : changeDependences();
912 : postprocessDAG();
913 : LLVM_DEBUG(dump());
914 :
915 125 : NodeSetType NodeSets;
916 190 : findCircuits(NodeSets);
917 125 : NodeSetType Circuits = NodeSets;
918 :
919 : // Calculate the MII.
920 190 : unsigned ResMII = calculateResMII();
921 190 : unsigned RecMII = calculateRecMII(NodeSets);
922 :
923 190 : fuseRecs(NodeSets);
924 :
925 : // This flag is used for testing and can cause correctness problems.
926 190 : if (SwpIgnoreRecMII)
927 1 : RecMII = 0;
928 :
929 190 : MII = std::max(ResMII, RecMII);
930 : LLVM_DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII
931 : << ", res=" << ResMII << ")\n");
932 :
933 : // Can't schedule a loop without a valid MII.
934 190 : if (MII == 0)
935 65 : return;
936 :
937 : // Don't pipeline large loops.
938 190 : if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
939 : return;
940 :
941 188 : computeNodeFunctions(NodeSets);
942 :
943 188 : registerPressureFilter(NodeSets);
944 :
945 188 : colocateNodeSets(NodeSets);
946 :
947 188 : checkNodeSets(NodeSets);
948 :
949 : LLVM_DEBUG({
950 : for (auto &I : NodeSets) {
951 : dbgs() << " Rec NodeSet ";
952 : I.dump();
953 : }
954 : });
955 :
956 : std::stable_sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
957 :
958 188 : groupRemainingNodes(NodeSets);
959 :
960 188 : removeDuplicateNodes(NodeSets);
961 :
962 : LLVM_DEBUG({
963 : for (auto &I : NodeSets) {
964 : dbgs() << " NodeSet ";
965 : I.dump();
966 : }
967 : });
968 :
969 188 : computeNodeOrder(NodeSets);
970 :
971 : // check for node order issues
972 188 : checkValidNodeOrder(Circuits);
973 :
974 313 : SMSchedule Schedule(Pass.MF);
975 188 : Scheduled = schedulePipeline(Schedule);
976 :
977 188 : if (!Scheduled)
978 63 : return;
979 :
980 : unsigned numStages = Schedule.getMaxStageCount();
981 : // No need to generate pipeline if there are no overlapped iterations.
982 125 : if (numStages == 0)
983 : return;
984 :
985 : // Check that the maximum stage count is less than user-defined limit.
986 125 : if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
987 : return;
988 :
989 125 : generatePipelinedLoop(Schedule);
990 : ++NumPipelined;
991 : }
992 :
993 : /// Clean up after the software pipeliner runs.
994 190 : void SwingSchedulerDAG::finishBlock() {
995 211 : for (MachineInstr *I : NewMIs)
996 21 : MF.DeleteMachineInstr(I);
997 190 : NewMIs.clear();
998 :
999 : // Call the superclass.
1000 190 : ScheduleDAGInstrs::finishBlock();
1001 190 : }
1002 :
1003 : /// Return the register values for the operands of a Phi instruction.
1004 : /// This function assume the instruction is a Phi.
1005 : static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
1006 : unsigned &InitVal, unsigned &LoopVal) {
1007 : assert(Phi.isPHI() && "Expecting a Phi.");
1008 :
1009 : InitVal = 0;
1010 : LoopVal = 0;
1011 8049 : for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
1012 10732 : if (Phi.getOperand(i + 1).getMBB() != Loop)
1013 1048 : InitVal = Phi.getOperand(i).getReg();
1014 : else
1015 2683 : LoopVal = Phi.getOperand(i).getReg();
1016 :
1017 : assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
1018 : }
1019 :
1020 : /// Return the Phi register value that comes from the incoming block.
1021 : static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
1022 342 : for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
1023 684 : if (Phi.getOperand(i + 1).getMBB() != LoopBB)
1024 342 : return Phi.getOperand(i).getReg();
1025 : return 0;
1026 : }
1027 :
1028 : /// Return the Phi register value that comes the loop block.
1029 : static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
1030 931 : for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
1031 1750 : if (Phi.getOperand(i + 1).getMBB() == LoopBB)
1032 382 : return Phi.getOperand(i).getReg();
1033 : return 0;
1034 : }
1035 :
1036 : /// Return true if SUb can be reached from SUa following the chain edges.
1037 364 : static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
1038 : SmallPtrSet<SUnit *, 8> Visited;
1039 : SmallVector<SUnit *, 8> Worklist;
1040 364 : Worklist.push_back(SUa);
1041 530 : while (!Worklist.empty()) {
1042 : const SUnit *SU = Worklist.pop_back_val();
1043 1118 : for (auto &SI : SU->Succs) {
1044 952 : SUnit *SuccSU = SI.getSUnit();
1045 952 : if (SI.getKind() == SDep::Order) {
1046 543 : if (Visited.count(SuccSU))
1047 0 : continue;
1048 543 : if (SuccSU == SUb)
1049 275 : return true;
1050 268 : Worklist.push_back(SuccSU);
1051 268 : Visited.insert(SuccSU);
1052 : }
1053 : }
1054 : }
1055 : return false;
1056 : }
1057 :
1058 : /// Return true if the instruction causes a chain between memory
1059 : /// references before and after it.
1060 2011 : static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
1061 4020 : return MI.isCall() || MI.hasUnmodeledSideEffects() ||
1062 2010 : (MI.hasOrderedMemoryRef() &&
1063 2 : (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
1064 : }
1065 :
1066 : /// Return the underlying objects for the memory references of an instruction.
1067 : /// This function calls the code in ValueTracking, but first checks that the
1068 : /// instruction has a memory operand.
1069 484 : static void getUnderlyingObjects(MachineInstr *MI,
1070 : SmallVectorImpl<Value *> &Objs,
1071 : const DataLayout &DL) {
1072 484 : if (!MI->hasOneMemOperand())
1073 : return;
1074 446 : MachineMemOperand *MM = *MI->memoperands_begin();
1075 446 : if (!MM->getValue())
1076 : return;
1077 446 : GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
1078 598 : for (Value *V : Objs) {
1079 446 : if (!isIdentifiedObject(V)) {
1080 : Objs.clear();
1081 294 : return;
1082 : }
1083 152 : Objs.push_back(V);
1084 : }
1085 : }
1086 :
1087 : /// Add a chain edge between a load and store if the store can be an
1088 : /// alias of the load on a subsequent iteration, i.e., a loop carried
1089 : /// dependence. This code is very similar to the code in ScheduleDAGInstrs
1090 : /// but that code doesn't create loop carried dependences.
1091 190 : void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
1092 : MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
1093 : Value *UnknownValue =
1094 190 : UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
1095 2201 : for (auto &SU : SUnits) {
1096 2011 : MachineInstr &MI = *SU.getInstr();
1097 2011 : if (isDependenceBarrier(MI, AA))
1098 : PendingLoads.clear();
1099 2008 : else if (MI.mayLoad()) {
1100 : SmallVector<Value *, 4> Objs;
1101 325 : getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1102 325 : if (Objs.empty())
1103 226 : Objs.push_back(UnknownValue);
1104 749 : for (auto V : Objs) {
1105 424 : SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
1106 424 : SUs.push_back(&SU);
1107 : }
1108 1683 : } else if (MI.mayStore()) {
1109 : SmallVector<Value *, 4> Objs;
1110 159 : getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1111 159 : if (Objs.empty())
1112 106 : Objs.push_back(UnknownValue);
1113 371 : for (auto V : Objs) {
1114 : MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
1115 212 : PendingLoads.find(V);
1116 212 : if (I == PendingLoads.end())
1117 : continue;
1118 476 : for (auto Load : I->second) {
1119 364 : if (isSuccOrder(Load, &SU))
1120 299 : continue;
1121 89 : MachineInstr &LdMI = *Load->getInstr();
1122 : // First, perform the cheaper check that compares the base register.
1123 : // If they are the same and the load offset is less than the store
1124 : // offset, then mark the dependence as loop carried potentially.
1125 : unsigned BaseReg1, BaseReg2;
1126 : int64_t Offset1, Offset2;
1127 177 : if (TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) &&
1128 88 : TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
1129 86 : if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
1130 : assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1131 : "What happened to the chain edge?");
1132 : SDep Dep(Load, SDep::Barrier);
1133 : Dep.setLatency(1);
1134 24 : SU.addPred(Dep);
1135 : continue;
1136 : }
1137 : }
1138 : // Second, the more expensive check that uses alias analysis on the
1139 : // base registers. If they alias, and the load offset is less than
1140 : // the store offset, the mark the dependence as loop carried.
1141 65 : if (!AA) {
1142 : SDep Dep(Load, SDep::Barrier);
1143 : Dep.setLatency(1);
1144 0 : SU.addPred(Dep);
1145 : continue;
1146 : }
1147 65 : MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1148 65 : MachineMemOperand *MMO2 = *MI.memoperands_begin();
1149 130 : if (!MMO1->getValue() || !MMO2->getValue()) {
1150 : SDep Dep(Load, SDep::Barrier);
1151 : Dep.setLatency(1);
1152 0 : SU.addPred(Dep);
1153 : continue;
1154 : }
1155 65 : if (MMO1->getValue() == MMO2->getValue() &&
1156 20 : MMO1->getOffset() <= MMO2->getOffset()) {
1157 : SDep Dep(Load, SDep::Barrier);
1158 : Dep.setLatency(1);
1159 0 : SU.addPred(Dep);
1160 : continue;
1161 : }
1162 65 : AliasResult AAResult = AA->alias(
1163 65 : MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
1164 : MMO1->getAAInfo()),
1165 65 : MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
1166 : MMO2->getAAInfo()));
1167 :
1168 65 : if (AAResult != NoAlias) {
1169 : SDep Dep(Load, SDep::Barrier);
1170 : Dep.setLatency(1);
1171 20 : SU.addPred(Dep);
1172 : }
1173 : }
1174 : }
1175 : }
1176 : }
1177 190 : }
1178 :
1179 : /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1180 : /// processes dependences for PHIs. This function adds true dependences
1181 : /// from a PHI to a use, and a loop carried dependence from the use to the
1182 : /// PHI. The loop carried dependence is represented as an anti dependence
1183 : /// edge. This function also removes chain dependences between unrelated
1184 : /// PHIs.
1185 190 : void SwingSchedulerDAG::updatePhiDependences() {
1186 : SmallVector<SDep, 4> RemoveDeps;
1187 190 : const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1188 :
1189 : // Iterate over each DAG node.
1190 2201 : for (SUnit &I : SUnits) {
1191 : RemoveDeps.clear();
1192 : // Set to true if the instruction has an operand defined by a Phi.
1193 : unsigned HasPhiUse = 0;
1194 : unsigned HasPhiDef = 0;
1195 2011 : MachineInstr *MI = I.getInstr();
1196 : // Iterate over each operand, and we process the definitions.
1197 7591 : for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1198 2011 : MOE = MI->operands_end();
1199 9602 : MOI != MOE; ++MOI) {
1200 7591 : if (!MOI->isReg())
1201 : continue;
1202 5668 : unsigned Reg = MOI->getReg();
1203 5668 : if (MOI->isDef()) {
1204 : // If the register is used by a Phi, then create an anti dependence.
1205 2738 : for (MachineRegisterInfo::use_instr_iterator
1206 2014 : UI = MRI.use_instr_begin(Reg),
1207 : UE = MRI.use_instr_end();
1208 7490 : UI != UE; ++UI) {
1209 : MachineInstr *UseMI = &*UI;
1210 : SUnit *SU = getSUnit(UseMI);
1211 2665 : if (SU != nullptr && UseMI->isPHI()) {
1212 : if (!MI->isPHI()) {
1213 : SDep Dep(SU, SDep::Anti, Reg);
1214 : Dep.setLatency(1);
1215 394 : I.addPred(Dep);
1216 : } else {
1217 : HasPhiDef = Reg;
1218 : // Add a chain edge to a dependent Phi that isn't an existing
1219 : // predecessor.
1220 40 : if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1221 14 : I.addPred(SDep(SU, SDep::Barrier));
1222 : }
1223 : }
1224 : }
1225 : } else if (MOI->isUse()) {
1226 : // If the register is defined by a Phi, then create a true dependence.
1227 3654 : MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1228 3654 : if (DefMI == nullptr)
1229 : continue;
1230 : SUnit *SU = getSUnit(DefMI);
1231 2688 : if (SU != nullptr && DefMI->isPHI()) {
1232 : if (!MI->isPHI()) {
1233 : SDep Dep(SU, SDep::Data, Reg);
1234 : Dep.setLatency(0);
1235 757 : ST.adjustSchedDependency(SU, &I, Dep);
1236 757 : I.addPred(Dep);
1237 : } else {
1238 : HasPhiUse = Reg;
1239 : // Add a chain edge to a dependent Phi that isn't an existing
1240 : // predecessor.
1241 38 : if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1242 0 : I.addPred(SDep(SU, SDep::Barrier));
1243 : }
1244 : }
1245 : }
1246 : }
1247 : // Remove order dependences from an unrelated Phi.
1248 2011 : if (!SwpPruneDeps)
1249 : continue;
1250 4897 : for (auto &PI : I.Preds) {
1251 2886 : MachineInstr *PMI = PI.getSUnit()->getInstr();
1252 1176 : if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1253 14 : if (I.getInstr()->isPHI()) {
1254 14 : if (PMI->getOperand(0).getReg() == HasPhiUse)
1255 : continue;
1256 28 : if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1257 : continue;
1258 : }
1259 0 : RemoveDeps.push_back(PI);
1260 : }
1261 : }
1262 2011 : for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1263 0 : I.removePred(RemoveDeps[i]);
1264 : }
1265 190 : }
1266 :
1267 : /// Iterate over each DAG node and see if we can change any dependences
1268 : /// in order to reduce the recurrence MII.
1269 190 : void SwingSchedulerDAG::changeDependences() {
1270 : // See if an instruction can use a value from the previous iteration.
1271 : // If so, we update the base and offset of the instruction and change
1272 : // the dependences.
1273 2201 : for (SUnit &I : SUnits) {
1274 2011 : unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1275 2011 : int64_t NewOffset = 0;
1276 2011 : if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1277 : NewOffset))
1278 1986 : continue;
1279 :
1280 : // Get the MI and SUnit for the instruction that defines the original base.
1281 25 : unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1282 25 : MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1283 25 : if (!DefMI)
1284 : continue;
1285 : SUnit *DefSU = getSUnit(DefMI);
1286 25 : if (!DefSU)
1287 0 : continue;
1288 : // Get the MI and SUnit for the instruction that defins the new base.
1289 25 : MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1290 25 : if (!LastMI)
1291 : continue;
1292 : SUnit *LastSU = getSUnit(LastMI);
1293 25 : if (!LastSU)
1294 0 : continue;
1295 :
1296 25 : if (Topo.IsReachable(&I, LastSU))
1297 : continue;
1298 :
1299 : // Remove the dependence. The value now depends on a prior iteration.
1300 : SmallVector<SDep, 4> Deps;
1301 50 : for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1302 : ++P)
1303 25 : if (P->getSUnit() == DefSU)
1304 25 : Deps.push_back(*P);
1305 50 : for (int i = 0, e = Deps.size(); i != e; i++) {
1306 50 : Topo.RemovePred(&I, Deps[i].getSUnit());
1307 25 : I.removePred(Deps[i]);
1308 : }
1309 : // Remove the chain dependence between the instructions.
1310 : Deps.clear();
1311 123 : for (auto &P : LastSU->Preds)
1312 98 : if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1313 22 : Deps.push_back(P);
1314 47 : for (int i = 0, e = Deps.size(); i != e; i++) {
1315 44 : Topo.RemovePred(LastSU, Deps[i].getSUnit());
1316 22 : LastSU->removePred(Deps[i]);
1317 : }
1318 :
1319 : // Add a dependence between the new instruction and the instruction
1320 : // that defines the new base.
1321 25 : SDep Dep(&I, SDep::Anti, NewBase);
1322 25 : Topo.AddPred(LastSU, &I);
1323 25 : LastSU->addPred(Dep);
1324 :
1325 : // Remember the base and offset information so that we can update the
1326 : // instruction during code generation.
1327 25 : InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1328 : }
1329 190 : }
1330 :
1331 : namespace {
1332 :
1333 : // FuncUnitSorter - Comparison operator used to sort instructions by
1334 : // the number of functional unit choices.
1335 12744 : struct FuncUnitSorter {
1336 : const InstrItineraryData *InstrItins;
1337 : DenseMap<unsigned, unsigned> Resources;
1338 :
1339 190 : FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1340 :
1341 : // Compute the number of functional unit alternatives needed
1342 : // at each stage, and take the minimum value. We prioritize the
1343 : // instructions by the least number of choices first.
1344 0 : unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1345 5895 : unsigned schedClass = Inst->getDesc().getSchedClass();
1346 : unsigned min = UINT_MAX;
1347 17794 : for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1348 : *IE = InstrItins->endStage(schedClass);
1349 29584 : IS != IE; ++IS) {
1350 17794 : unsigned funcUnits = IS->getUnits();
1351 : unsigned numAlternatives = countPopulation(funcUnits);
1352 17794 : if (numAlternatives < min) {
1353 : min = numAlternatives;
1354 13113 : F = funcUnits;
1355 : }
1356 : }
1357 0 : return min;
1358 : }
1359 :
1360 : // Compute the critical resources needed by the instruction. This
1361 : // function records the functional units needed by instructions that
1362 : // must use only one functional unit. We use this as a tie breaker
1363 : // for computing the resource MII. The instrutions that require
1364 : // the same, highly used, functional unit have high priority.
1365 1589 : void calcCriticalResources(MachineInstr &MI) {
1366 1589 : unsigned SchedClass = MI.getDesc().getSchedClass();
1367 3945 : for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1368 : *IE = InstrItins->endStage(SchedClass);
1369 3945 : IS != IE; ++IS) {
1370 2356 : unsigned FuncUnits = IS->getUnits();
1371 2356 : if (countPopulation(FuncUnits) == 1)
1372 674 : Resources[FuncUnits]++;
1373 : }
1374 1589 : }
1375 :
1376 : /// Return true if IS1 has less priority than IS2.
1377 5895 : bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1378 5895 : unsigned F1 = 0, F2 = 0;
1379 5895 : unsigned MFUs1 = minFuncUnits(IS1, F1);
1380 : unsigned MFUs2 = minFuncUnits(IS2, F2);
1381 5895 : if (MFUs1 == 1 && MFUs2 == 1)
1382 2232 : return Resources.lookup(F1) < Resources.lookup(F2);
1383 4779 : return MFUs1 > MFUs2;
1384 : }
1385 : };
1386 :
1387 : } // end anonymous namespace
1388 :
1389 : /// Calculate the resource constrained minimum initiation interval for the
1390 : /// specified loop. We use the DFA to model the resources needed for
1391 : /// each instruction, and we ignore dependences. A different DFA is created
1392 : /// for each cycle that is required. When adding a new instruction, we attempt
1393 : /// to add it to each existing DFA, until a legal space is found. If the
1394 : /// instruction cannot be reserved in an existing DFA, we create a new one.
1395 190 : unsigned SwingSchedulerDAG::calculateResMII() {
1396 : SmallVector<DFAPacketizer *, 8> Resources;
1397 190 : MachineBasicBlock *MBB = Loop.getHeader();
1398 190 : Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1399 :
1400 : // Sort the instructions by the number of available choices for scheduling,
1401 : // least to most. Use the number of critical resources as the tie breaker.
1402 : FuncUnitSorter FUS =
1403 190 : FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1404 190 : for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1405 190 : E = MBB->getFirstTerminator();
1406 1779 : I != E; ++I)
1407 1589 : FUS.calcCriticalResources(*I);
1408 : PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1409 190 : FuncUnitOrder(FUS);
1410 :
1411 190 : for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1412 190 : E = MBB->getFirstTerminator();
1413 1779 : I != E; ++I)
1414 1589 : FuncUnitOrder.push(&*I);
1415 :
1416 1779 : while (!FuncUnitOrder.empty()) {
1417 1589 : MachineInstr *MI = FuncUnitOrder.top();
1418 1589 : FuncUnitOrder.pop();
1419 3178 : if (TII->isZeroCost(MI->getOpcode()))
1420 : continue;
1421 : // Attempt to reserve the instruction in an existing DFA. At least one
1422 : // DFA is needed for each cycle.
1423 1507 : unsigned NumCycles = getSUnit(MI)->Latency;
1424 : unsigned ReservedCycles = 0;
1425 : SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
1426 : SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
1427 3014 : for (unsigned C = 0; C < NumCycles; ++C)
1428 6050 : while (RI != RE) {
1429 5715 : if ((*RI++)->canReserveResources(*MI)) {
1430 1172 : ++ReservedCycles;
1431 1172 : break;
1432 : }
1433 : }
1434 : // Start reserving resources using existing DFAs.
1435 2679 : for (unsigned C = 0; C < ReservedCycles; ++C) {
1436 1172 : --RI;
1437 1172 : (*RI)->reserveResources(*MI);
1438 : }
1439 : // Add new DFAs, if needed, to reserve resources.
1440 1842 : for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1441 : DFAPacketizer *NewResource =
1442 335 : TII->CreateTargetScheduleState(MF.getSubtarget());
1443 : assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1444 335 : NewResource->reserveResources(*MI);
1445 335 : Resources.push_back(NewResource);
1446 : }
1447 : }
1448 190 : int Resmii = Resources.size();
1449 : // Delete the memory for each of the DFAs that were created earlier.
1450 715 : for (DFAPacketizer *RI : Resources) {
1451 : DFAPacketizer *D = RI;
1452 1050 : delete D;
1453 : }
1454 : Resources.clear();
1455 190 : return Resmii;
1456 : }
1457 :
1458 : /// Calculate the recurrence-constrainted minimum initiation interval.
1459 : /// Iterate over each circuit. Compute the delay(c) and distance(c)
1460 : /// for each circuit. The II needs to satisfy the inequality
1461 : /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1462 : /// II that satisfies the inequality, and the RecMII is the maximum
1463 : /// of those values.
1464 0 : unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1465 : unsigned RecMII = 0;
1466 :
1467 768 : for (NodeSet &Nodes : NodeSets) {
1468 578 : if (Nodes.empty())
1469 0 : continue;
1470 :
1471 578 : unsigned Delay = Nodes.getLatency();
1472 : unsigned Distance = 1;
1473 :
1474 : // ii = ceil(delay / distance)
1475 : unsigned CurMII = (Delay + Distance - 1) / Distance;
1476 : Nodes.setRecMII(CurMII);
1477 578 : if (CurMII > RecMII)
1478 : RecMII = CurMII;
1479 : }
1480 :
1481 0 : return RecMII;
1482 : }
1483 :
1484 : /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1485 : /// but we do this to find the circuits, and then change them back.
1486 380 : static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1487 : SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1488 4782 : for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1489 4022 : SUnit *SU = &SUnits[i];
1490 5736 : for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1491 9758 : IP != EP; ++IP) {
1492 5736 : if (IP->getKind() != SDep::Anti)
1493 : continue;
1494 842 : DepsAdded.push_back(std::make_pair(SU, *IP));
1495 : }
1496 : }
1497 842 : for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1498 : E = DepsAdded.end();
1499 1222 : I != E; ++I) {
1500 : // Remove this anti dependency and add one in the reverse direction.
1501 842 : SUnit *SU = I->first;
1502 842 : SDep &D = I->second;
1503 : SUnit *TargetSU = D.getSUnit();
1504 842 : unsigned Reg = D.getReg();
1505 842 : unsigned Lat = D.getLatency();
1506 842 : SU->removePred(D);
1507 : SDep Dep(SU, SDep::Anti, Reg);
1508 : Dep.setLatency(Lat);
1509 842 : TargetSU->addPred(Dep);
1510 : }
1511 380 : }
1512 :
1513 : /// Create the adjacency structure of the nodes in the graph.
1514 190 : void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1515 : SwingSchedulerDAG *DAG) {
1516 380 : BitVector Added(SUnits.size());
1517 : DenseMap<int, int> OutputDeps;
1518 2391 : for (int i = 0, e = SUnits.size(); i != e; ++i) {
1519 : Added.reset();
1520 : // Add any successor to the adjacency matrix and exclude duplicates.
1521 4879 : for (auto &SI : SUnits[i].Succs) {
1522 : // Only create a back-edge on the first and last nodes of a dependence
1523 : // chain. This records any chains and adds them later.
1524 2868 : if (SI.getKind() == SDep::Output) {
1525 0 : int N = SI.getSUnit()->NodeNum;
1526 0 : int BackEdge = i;
1527 0 : auto Dep = OutputDeps.find(BackEdge);
1528 0 : if (Dep != OutputDeps.end()) {
1529 0 : BackEdge = Dep->second;
1530 : OutputDeps.erase(Dep);
1531 : }
1532 0 : OutputDeps[N] = BackEdge;
1533 : }
1534 : // Do not process a boundary node and a back-edge is processed only
1535 : // if it goes to a Phi.
1536 2868 : if (SI.getSUnit()->isBoundaryNode() ||
1537 421 : (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1538 27 : continue;
1539 2841 : int N = SI.getSUnit()->NodeNum;
1540 2841 : if (!Added.test(N)) {
1541 2826 : AdjK[i].push_back(N);
1542 2826 : Added.set(N);
1543 : }
1544 : }
1545 : // A chain edge between a store and a load is treated as a back-edge in the
1546 : // adjacency matrix.
1547 4879 : for (auto &PI : SUnits[i].Preds) {
1548 6246 : if (!SUnits[i].getInstr()->mayStore() ||
1549 1020 : !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1550 2750 : continue;
1551 118 : if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1552 117 : int N = PI.getSUnit()->NodeNum;
1553 117 : if (!Added.test(N)) {
1554 117 : AdjK[i].push_back(N);
1555 117 : Added.set(N);
1556 : }
1557 : }
1558 : }
1559 : }
1560 : // Add back-eges in the adjacency matrix for the output dependences.
1561 190 : for (auto &OD : OutputDeps)
1562 0 : if (!Added.test(OD.second)) {
1563 0 : AdjK[OD.first].push_back(OD.second);
1564 0 : Added.set(OD.second);
1565 : }
1566 190 : }
1567 :
1568 : /// Identify an elementary circuit in the dependence graph starting at the
1569 : /// specified node.
1570 16475 : bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1571 : bool HasBackedge) {
1572 16475 : SUnit *SV = &SUnits[V];
1573 : bool F = false;
1574 16475 : Stack.insert(SV);
1575 16475 : Blocked.set(V);
1576 :
1577 39508 : for (auto W : AdjK[V]) {
1578 23927 : if (NumPaths > MaxPaths)
1579 : break;
1580 23788 : if (W < S)
1581 : continue;
1582 20524 : if (W == S) {
1583 755 : if (!HasBackedge)
1584 1734 : NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1585 : F = true;
1586 755 : ++NumPaths;
1587 : break;
1588 39538 : } else if (!Blocked.test(W)) {
1589 14464 : if (circuit(W, S, NodeSets,
1590 14464 : Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1591 : F = true;
1592 : }
1593 : }
1594 :
1595 15720 : if (F)
1596 2418 : unblock(V);
1597 : else {
1598 33142 : for (auto W : AdjK[V]) {
1599 19085 : if (W < S)
1600 : continue;
1601 32306 : if (B[W].count(SV) == 0)
1602 15629 : B[W].insert(SV);
1603 : }
1604 : }
1605 : Stack.pop_back();
1606 16475 : return F;
1607 : }
1608 :
1609 : /// Unblock a node in the circuit finding algorithm.
1610 5468 : void SwingSchedulerDAG::Circuits::unblock(int U) {
1611 5468 : Blocked.reset(U);
1612 5468 : SmallPtrSet<SUnit *, 4> &BU = B[U];
1613 10230 : while (!BU.empty()) {
1614 4762 : SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1615 : assert(SI != BU.end() && "Invalid B set.");
1616 4762 : SUnit *W = *SI;
1617 : BU.erase(W);
1618 9524 : if (Blocked.test(W->NodeNum))
1619 3050 : unblock(W->NodeNum);
1620 : }
1621 5468 : }
1622 :
1623 : /// Identify all the elementary circuits in the dependence graph using
1624 : /// Johnson's circuit algorithm.
1625 190 : void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1626 : // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1627 : // but we do this to find the circuits, and then change them back.
1628 190 : swapAntiDependences(SUnits);
1629 :
1630 380 : Circuits Cir(SUnits, Topo);
1631 : // Create the adjacency structure.
1632 190 : Cir.createAdjacencyStructure(this);
1633 2391 : for (int i = 0, e = SUnits.size(); i != e; ++i) {
1634 2011 : Cir.reset();
1635 2011 : Cir.circuit(i, i, NodeSets);
1636 : }
1637 :
1638 : // Change the dependences back so that we've created a DAG again.
1639 190 : swapAntiDependences(SUnits);
1640 190 : }
1641 :
1642 : // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1643 : // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1644 : // additional copies that are needed across iterations. An artificial dependence
1645 : // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1646 :
1647 : // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1648 : // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1649 : // PHI-------True-Dep------> USEOfPhi
1650 :
1651 : // The mutation creates
1652 : // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1653 :
1654 : // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1655 : // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1656 : // late to avoid additional copies across iterations. The possible scheduling
1657 : // order would be
1658 : // USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1659 :
1660 190 : void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1661 2201 : for (SUnit &SU : DAG->SUnits) {
1662 : // Find the COPY/REG_SEQUENCE instruction.
1663 4022 : if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1664 2004 : continue;
1665 :
1666 : // Record the loop carried PHIs.
1667 : SmallVector<SUnit *, 4> PHISUs;
1668 : // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1669 : SmallVector<SUnit *, 4> SrcSUs;
1670 :
1671 228 : for (auto &Dep : SU.Preds) {
1672 148 : SUnit *TmpSU = Dep.getSUnit();
1673 148 : MachineInstr *TmpMI = TmpSU->getInstr();
1674 : SDep::Kind DepKind = Dep.getKind();
1675 : // Save the loop carried PHI.
1676 148 : if (DepKind == SDep::Anti && TmpMI->isPHI())
1677 8 : PHISUs.push_back(TmpSU);
1678 : // Save the source of COPY/REG_SEQUENCE.
1679 : // If the source has no pre-decessors, we will end up creating cycles.
1680 258 : else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1681 115 : SrcSUs.push_back(TmpSU);
1682 : }
1683 :
1684 80 : if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1685 : continue;
1686 :
1687 : // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1688 : // SUnit to the container.
1689 : SmallVector<SUnit *, 8> UseSUs;
1690 18 : for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
1691 29 : for (auto &Dep : (*I)->Succs) {
1692 18 : if (Dep.getKind() != SDep::Data)
1693 11 : continue;
1694 :
1695 11 : SUnit *TmpSU = Dep.getSUnit();
1696 11 : MachineInstr *TmpMI = TmpSU->getInstr();
1697 11 : if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1698 4 : PHISUs.push_back(TmpSU);
1699 4 : continue;
1700 : }
1701 7 : UseSUs.push_back(TmpSU);
1702 : }
1703 : }
1704 :
1705 14 : if (UseSUs.size() == 0)
1706 : continue;
1707 :
1708 : SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1709 : // Add the artificial dependencies if it does not form a cycle.
1710 14 : for (auto I : UseSUs) {
1711 14 : for (auto Src : SrcSUs) {
1712 7 : if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1713 4 : Src->addPred(SDep(I, SDep::Artificial));
1714 4 : SDAG->Topo.AddPred(Src, I);
1715 : }
1716 : }
1717 : }
1718 : }
1719 190 : }
1720 :
1721 : /// Return true for DAG nodes that we ignore when computing the cost functions.
1722 : /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1723 : /// in the calculation of the ASAP, ALAP, etc functions.
1724 : static bool ignoreDependence(const SDep &D, bool isPred) {
1725 : if (D.isArtificial())
1726 : return true;
1727 13613 : return D.getKind() == SDep::Anti && isPred;
1728 : }
1729 :
1730 : /// Compute several functions need to order the nodes for scheduling.
1731 : /// ASAP - Earliest time to schedule a node.
1732 : /// ALAP - Latest time to schedule a node.
1733 : /// MOV - Mobility function, difference between ALAP and ASAP.
1734 : /// D - Depth of each node.
1735 : /// H - Height of each node.
1736 188 : void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1737 376 : ScheduleInfo.resize(SUnits.size());
1738 :
1739 : LLVM_DEBUG({
1740 : for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1741 : E = Topo.end();
1742 : I != E; ++I) {
1743 : const SUnit &SU = SUnits[*I];
1744 : dumpNode(SU);
1745 : }
1746 : });
1747 :
1748 188 : int maxASAP = 0;
1749 : // Compute ASAP and ZeroLatencyDepth.
1750 : for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1751 : E = Topo.end();
1752 2033 : I != E; ++I) {
1753 1845 : int asap = 0;
1754 1845 : int zeroLatencyDepth = 0;
1755 1845 : SUnit *SU = &SUnits[*I];
1756 2603 : for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1757 : EP = SU->Preds.end();
1758 4448 : IP != EP; ++IP) {
1759 : SUnit *pred = IP->getSUnit();
1760 2603 : if (IP->getLatency() == 0)
1761 1100 : zeroLatencyDepth =
1762 3032 : std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1763 : if (ignoreDependence(*IP, true))
1764 : continue;
1765 5328 : asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1766 : getDistance(pred, SU, *IP) * MII));
1767 : }
1768 1845 : maxASAP = std::max(maxASAP, asap);
1769 1845 : ScheduleInfo[*I].ASAP = asap;
1770 3690 : ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1771 : }
1772 :
1773 : // Compute ALAP, ZeroLatencyHeight, and MOV.
1774 : for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1775 : E = Topo.rend();
1776 2033 : I != E; ++I) {
1777 1845 : int alap = maxASAP;
1778 1845 : int zeroLatencyHeight = 0;
1779 1845 : SUnit *SU = &SUnits[*I];
1780 2603 : for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1781 : ES = SU->Succs.end();
1782 4448 : IS != ES; ++IS) {
1783 : SUnit *succ = IS->getSUnit();
1784 2603 : if (IS->getLatency() == 0)
1785 1100 : zeroLatencyHeight =
1786 2952 : std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1787 : if (ignoreDependence(*IS, true))
1788 : continue;
1789 5738 : alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1790 : getDistance(SU, succ, *IS) * MII));
1791 : }
1792 :
1793 1845 : ScheduleInfo[*I].ALAP = alap;
1794 3690 : ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1795 : }
1796 :
1797 : // After computing the node functions, compute the summary for each node set.
1798 608 : for (NodeSet &I : NodeSets)
1799 420 : I.computeNodeSetInfo(this);
1800 :
1801 : LLVM_DEBUG({
1802 : for (unsigned i = 0; i < SUnits.size(); i++) {
1803 : dbgs() << "\tNode " << i << ":\n";
1804 : dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1805 : dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1806 : dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1807 : dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1808 : dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1809 : dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1810 : dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1811 : }
1812 : });
1813 188 : }
1814 :
1815 : /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1816 : /// as the predecessors of the elements of NodeOrder that are not also in
1817 : /// NodeOrder.
1818 849 : static bool pred_L(SetVector<SUnit *> &NodeOrder,
1819 : SmallSetVector<SUnit *, 8> &Preds,
1820 : const NodeSet *S = nullptr) {
1821 849 : Preds.clear();
1822 : for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1823 7472 : I != E; ++I) {
1824 17998 : for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1825 17998 : PI != PE; ++PI) {
1826 11375 : if (S && S->count(PI->getSUnit()) == 0)
1827 2948 : continue;
1828 : if (ignoreDependence(*PI, true))
1829 : continue;
1830 7095 : if (NodeOrder.count(PI->getSUnit()) == 0)
1831 1018 : Preds.insert(PI->getSUnit());
1832 : }
1833 : // Back-edges are predecessors with an anti-dependence.
1834 16597 : for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1835 : ES = (*I)->Succs.end();
1836 16597 : IS != ES; ++IS) {
1837 9974 : if (IS->getKind() != SDep::Anti)
1838 : continue;
1839 1313 : if (S && S->count(IS->getSUnit()) == 0)
1840 304 : continue;
1841 1009 : if (NodeOrder.count(IS->getSUnit()) == 0)
1842 0 : Preds.insert(IS->getSUnit());
1843 : }
1844 : }
1845 849 : return !Preds.empty();
1846 : }
1847 :
1848 : /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1849 : /// as the successors of the elements of NodeOrder that are not also in
1850 : /// NodeOrder.
1851 2476 : static bool succ_L(SetVector<SUnit *> &NodeOrder,
1852 : SmallSetVector<SUnit *, 8> &Succs,
1853 : const NodeSet *S = nullptr) {
1854 2476 : Succs.clear();
1855 : for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1856 14473 : I != E; ++I) {
1857 30113 : for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1858 30113 : SI != SE; ++SI) {
1859 18116 : if (S && S->count(SI->getSUnit()) == 0)
1860 2946 : continue;
1861 : if (ignoreDependence(*SI, false))
1862 : continue;
1863 15128 : if (NodeOrder.count(SI->getSUnit()) == 0)
1864 1933 : Succs.insert(SI->getSUnit());
1865 : }
1866 33558 : for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1867 : PE = (*I)->Preds.end();
1868 33558 : PI != PE; ++PI) {
1869 21561 : if (PI->getKind() != SDep::Anti)
1870 : continue;
1871 3741 : if (S && S->count(PI->getSUnit()) == 0)
1872 468 : continue;
1873 3273 : if (NodeOrder.count(PI->getSUnit()) == 0)
1874 887 : Succs.insert(PI->getSUnit());
1875 : }
1876 : }
1877 2476 : return !Succs.empty();
1878 : }
1879 :
1880 : /// Return true if there is a path from the specified node to any of the nodes
1881 : /// in DestNodes. Keep track and return the nodes in any path.
1882 5219 : static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1883 : SetVector<SUnit *> &DestNodes,
1884 : SetVector<SUnit *> &Exclude,
1885 : SmallPtrSet<SUnit *, 8> &Visited) {
1886 5219 : if (Cur->isBoundaryNode())
1887 : return false;
1888 5219 : if (Exclude.count(Cur) != 0)
1889 : return false;
1890 8898 : if (DestNodes.count(Cur) != 0)
1891 : return true;
1892 3951 : if (!Visited.insert(Cur).second)
1893 1816 : return Path.count(Cur) != 0;
1894 : bool FoundPath = false;
1895 6937 : for (auto &SI : Cur->Succs)
1896 3894 : FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1897 7901 : for (auto &PI : Cur->Preds)
1898 4858 : if (PI.getKind() == SDep::Anti)
1899 291 : FoundPath |=
1900 291 : computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1901 3043 : if (FoundPath)
1902 693 : Path.insert(Cur);
1903 : return FoundPath;
1904 : }
1905 :
1906 : /// Return true if Set1 is a subset of Set2.
1907 571 : template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1908 1034 : for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1909 1004 : if (Set2.count(*I) == 0)
1910 192 : return false;
1911 : return true;
1912 : }
1913 391 :
1914 787 : /// Compute the live-out registers for the instructions in a node-set.
1915 588 : /// The live-out registers are those that are defined in the node-set,
1916 192 : /// but not used. Except for use operands of Phis.
1917 : static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1918 : NodeSet &NS) {
1919 180 : const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1920 247 : MachineRegisterInfo &MRI = MF.getRegInfo();
1921 416 : SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1922 : SmallSet<unsigned, 4> Uses;
1923 : for (SUnit *SU : NS) {
1924 : const MachineInstr *MI = SU->getInstr();
1925 : if (MI->isPHI())
1926 : continue;
1927 : for (const MachineOperand &MO : MI->operands())
1928 : if (MO.isReg() && MO.isUse()) {
1929 83 : unsigned Reg = MO.getReg();
1930 : if (TargetRegisterInfo::isVirtualRegister(Reg))
1931 83 : Uses.insert(Reg);
1932 83 : else if (MRI.isAllocatable(Reg))
1933 : for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1934 83 : Uses.insert(*Units);
1935 645 : }
1936 562 : }
1937 : for (SUnit *SU : NS)
1938 : for (const MachineOperand &MO : SU->getInstr()->operands())
1939 2519 : if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1940 1976 : unsigned Reg = MO.getReg();
1941 1131 : if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1942 1131 : if (!Uses.count(Reg))
1943 1129 : LiveOutRegs.push_back(RegisterMaskPair(Reg,
1944 2 : LaneBitmask::getNone()));
1945 0 : } else if (MRI.isAllocatable(Reg)) {
1946 0 : for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1947 : if (!Uses.count(*Units))
1948 : LiveOutRegs.push_back(RegisterMaskPair(*Units,
1949 645 : LaneBitmask::getNone()));
1950 2633 : }
1951 2071 : }
1952 542 : RPTracker.addLiveRegs(LiveOutRegs);
1953 542 : }
1954 542 :
1955 95 : /// A heuristic to filter nodes in recurrent node-sets if the register
1956 : /// pressure of a set is too high.
1957 0 : void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1958 0 : for (auto &NS : NodeSets) {
1959 0 : // Skip small node-sets since they won't cause register pressure problems.
1960 0 : if (NS.size() <= 2)
1961 : continue;
1962 : IntervalPressure RecRegPressure;
1963 : RegPressureTracker RecRPTracker(RecRegPressure);
1964 83 : RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1965 83 : computeLiveOuts(MF, RecRPTracker, NS);
1966 : RecRPTracker.closeBottom();
1967 :
1968 : std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1969 188 : llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1970 608 : return A->NodeNum > B->NodeNum;
1971 : });
1972 420 :
1973 337 : for (auto &SU : SUnits) {
1974 : // Since we're computing the register pressure for a subset of the
1975 83 : // instructions in a block, we need to set the tracker for each
1976 166 : // instruction in the node-set. The tracker is set to the instruction
1977 83 : // just after the one we're interested in.
1978 83 : MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1979 : RecRPTracker.setPos(std::next(CurInstI));
1980 :
1981 : RegPressureDelta RPDelta;
1982 0 : ArrayRef<PressureChange> CriticalPSets;
1983 : RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1984 : CriticalPSets,
1985 639 : RecRegPressure.MaxSetPressure);
1986 : if (RPDelta.Excess.isValid()) {
1987 : LLVM_DEBUG(
1988 : dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1989 : << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1990 557 : << ":" << RPDelta.Excess.getUnitInc());
1991 557 : NS.setExceedPressure(SU);
1992 : break;
1993 557 : }
1994 557 : RecRPTracker.recede();
1995 557 : }
1996 : }
1997 : }
1998 557 :
1999 : /// A heuristic to colocate node sets that have the same set of
2000 : /// successors.
2001 : void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2002 : unsigned Colocate = 0;
2003 1 : for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2004 1 : NodeSet &N1 = NodeSets[i];
2005 : SmallSetVector<SUnit *, 8> S1;
2006 556 : if (N1.empty() || !succ_L(N1, S1))
2007 : continue;
2008 : for (int j = i + 1; j < e; ++j) {
2009 188 : NodeSet &N2 = NodeSets[j];
2010 : if (N1.compareRecMII(N2) != 0)
2011 : continue;
2012 : SmallSetVector<SUnit *, 8> S2;
2013 0 : if (N2.empty() || !succ_L(N2, S2))
2014 : continue;
2015 0 : if (isSubset(S1, S2) && S1.size() == S2.size()) {
2016 0 : N1.setColocate(++Colocate);
2017 : N2.setColocate(Colocate);
2018 0 : break;
2019 : }
2020 0 : }
2021 0 : }
2022 0 : }
2023 0 :
2024 : /// Check if the existing node-sets are profitable. If not, then ignore the
2025 0 : /// recurrent node-sets, and attempt to schedule all nodes together. This is
2026 : /// a heuristic. If the MII is large and all the recurrent node-sets are small,
2027 0 : /// then it's best to try to schedule all instructions together instead of
2028 0 : /// starting with the recurrent node-sets.
2029 : void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2030 : // Look for loops with a large MII.
2031 : if (MII < 17)
2032 : return;
2033 : // Check if the node-set contains only a simple add recurrence.
2034 0 : for (auto &NS : NodeSets) {
2035 : if (NS.getRecMII() > 2)
2036 : return;
2037 : if (NS.getMaxDepth() > MII)
2038 : return;
2039 : }
2040 : NodeSets.clear();
2041 0 : LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2042 : return;
2043 0 : }
2044 0 :
2045 : /// Add the nodes that do not belong to a recurrence set into groups
2046 0 : /// based upon connected componenets.
2047 0 : void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2048 0 : SetVector<SUnit *> NodesAdded;
2049 0 : SmallPtrSet<SUnit *, 8> Visited;
2050 0 : // Add the nodes that are on a path between the previous node sets and
2051 : // the current node set.
2052 : for (NodeSet &I : NodeSets) {
2053 : SmallSetVector<SUnit *, 8> N;
2054 : // Add the nodes from the current node set to the previous node set.
2055 : if (succ_L(I, N)) {
2056 : SetVector<SUnit *> Path;
2057 : for (SUnit *NI : N) {
2058 : Visited.clear();
2059 188 : computePath(NI, Path, NodesAdded, I, Visited);
2060 188 : }
2061 : if (!Path.empty())
2062 : I.insert(Path.begin(), Path.end());
2063 : }
2064 608 : // Add the nodes from the previous node set to the current node set.
2065 : N.clear();
2066 : if (succ_L(NodesAdded, N)) {
2067 420 : SetVector<SUnit *> Path;
2068 315 : for (SUnit *NI : N) {
2069 922 : Visited.clear();
2070 607 : computePath(NI, Path, I, NodesAdded, Visited);
2071 607 : }
2072 : if (!Path.empty())
2073 315 : I.insert(Path.begin(), Path.end());
2074 53 : }
2075 : NodesAdded.insert(I.begin(), I.end());
2076 : }
2077 :
2078 420 : // Create a new node set with the connected nodes of any successor of a node
2079 165 : // in a recurrent set.
2080 592 : NodeSet NewSet;
2081 427 : SmallSetVector<SUnit *, 8> N;
2082 427 : if (succ_L(NodesAdded, N))
2083 : for (SUnit *I : N)
2084 165 : addConnectedNodes(I, NewSet, NodesAdded);
2085 30 : if (!NewSet.empty())
2086 : NodeSets.push_back(NewSet);
2087 420 :
2088 : // Create a new node set with the connected nodes of any predecessor of a node
2089 : // in a recurrent set.
2090 : NewSet.clear();
2091 : if (pred_L(NodesAdded, N))
2092 : for (SUnit *I : N)
2093 : addConnectedNodes(I, NewSet, NodesAdded);
2094 188 : if (!NewSet.empty())
2095 218 : NodeSets.push_back(NewSet);
2096 141 :
2097 188 : // Create new nodes sets with the connected nodes any remaining node that
2098 77 : // has no predecessor.
2099 : for (unsigned i = 0; i < SUnits.size(); ++i) {
2100 : SUnit *SU = &SUnits[i];
2101 : if (NodesAdded.count(SU) == 0) {
2102 : NewSet.clear();
2103 188 : addConnectedNodes(SU, NewSet, NodesAdded);
2104 49 : if (!NewSet.empty())
2105 33 : NodeSets.push_back(NewSet);
2106 188 : }
2107 16 : }
2108 : }
2109 :
2110 : /// Add the node to the set, and add all is its connected nodes to the set.
2111 2221 : void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2112 : SetVector<SUnit *> &NodesAdded) {
2113 1845 : NewSet.insert(SU);
2114 : NodesAdded.insert(SU);
2115 9 : for (auto &SI : SU->Succs) {
2116 9 : SUnit *Successor = SI.getSUnit();
2117 9 : if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
2118 : addConnectedNodes(Successor, NewSet, NodesAdded);
2119 : }
2120 188 : for (auto &PI : SU->Preds) {
2121 : SUnit *Predecessor = PI.getSUnit();
2122 : if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2123 545 : addConnectedNodes(Predecessor, NewSet, NodesAdded);
2124 : }
2125 545 : }
2126 545 :
2127 1219 : /// Return true if Set1 contains elements in Set2. The elements in common
2128 : /// are returned in a different container.
2129 674 : static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2130 196 : SmallSetVector<SUnit *, 8> &Result) {
2131 : Result.clear();
2132 1254 : for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
2133 : SUnit *SU = Set1[i];
2134 709 : if (Set2.count(SU) != 0)
2135 166 : Result.insert(SU);
2136 : }
2137 545 : return !Result.empty();
2138 : }
2139 :
2140 : /// Merge the recurrence node sets that have the same initial node.
2141 306 : void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2142 : for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2143 306 : ++I) {
2144 628 : NodeSet &NI = *I;
2145 644 : for (NodeSetType::iterator J = I + 1; J != E;) {
2146 : NodeSet &NJ = *J;
2147 63 : if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2148 : if (NJ.compareRecMII(NI) > 0)
2149 306 : NI.setRecMII(NJ.getRecMII());
2150 : for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
2151 : ++NII)
2152 : I->insert(*NII);
2153 0 : NodeSets.erase(J);
2154 0 : E = NodeSets.end();
2155 : } else {
2156 : ++J;
2157 0 : }
2158 : }
2159 0 : }
2160 0 : }
2161 :
2162 0 : /// Remove nodes that have been scheduled in previous NodeSets.
2163 : void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2164 0 : for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2165 0 : ++I)
2166 : for (NodeSetType::iterator J = I + 1; J != E;) {
2167 : J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2168 0 :
2169 : if (J->empty()) {
2170 : NodeSets.erase(J);
2171 : E = NodeSets.end();
2172 0 : } else {
2173 : ++J;
2174 : }
2175 0 : }
2176 0 : }
2177 :
2178 0 : /// Compute an ordered list of the dependence graph nodes, which
2179 2960 : /// indicates the order that the nodes will be scheduled. This is a
2180 : /// two-level algorithm. First, a partial order is created, which
2181 0 : /// consists of a list of sets ordered from highest to lowest priority.
2182 0 : void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2183 : SmallSetVector<SUnit *, 8> R;
2184 : NodeOrder.clear();
2185 0 :
2186 : for (auto &Nodes : NodeSets) {
2187 : LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2188 0 : OrderKind Order;
2189 : SmallSetVector<SUnit *, 8> N;
2190 : if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
2191 : R.insert(N.begin(), N.end());
2192 : Order = BottomUp;
2193 : LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2194 188 : } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
2195 : R.insert(N.begin(), N.end());
2196 188 : Order = TopDown;
2197 : LLVM_DEBUG(dbgs() << " Top down (succs) ");
2198 693 : } else if (isIntersect(N, Nodes, R)) {
2199 : // If some of the successors are in the existing node-set, then use the
2200 : // top-down ordering.
2201 : Order = TopDown;
2202 505 : LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2203 95 : } else if (NodeSets.size() == 1) {
2204 : for (auto &N : Nodes)
2205 : if (N->Succs.size() == 0)
2206 410 : R.insert(N);
2207 104 : Order = BottomUp;
2208 : LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2209 : } else {
2210 306 : // Find the node with the highest ASAP.
2211 : SUnit *maxASAP = nullptr;
2212 : for (SUnit *SU : Nodes) {
2213 : if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2214 : (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2215 265 : maxASAP = SU;
2216 179 : }
2217 169 : R.insert(maxASAP);
2218 13 : Order = BottomUp;
2219 : LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2220 : }
2221 :
2222 : while (!R.empty()) {
2223 255 : if (Order == TopDown) {
2224 936 : // Choose the node with the maximum height. If more than one, choose
2225 681 : // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2226 211 : // choose the node with the lowest MOV.
2227 625 : while (!R.empty()) {
2228 : SUnit *maxHeight = nullptr;
2229 255 : for (SUnit *I : R) {
2230 : if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2231 : maxHeight = I;
2232 : else if (getHeight(I) == getHeight(maxHeight) &&
2233 : getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2234 1029 : maxHeight = I;
2235 524 : else if (getHeight(I) == getHeight(maxHeight) &&
2236 : getZeroLatencyHeight(I) ==
2237 : getZeroLatencyHeight(maxHeight) &&
2238 : getMOV(I) < getMOV(maxHeight))
2239 723 : maxHeight = I;
2240 567 : }
2241 2044 : NodeOrder.insert(maxHeight);
2242 2422 : LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2243 703 : R.remove(maxHeight);
2244 774 : for (const auto &I : maxHeight->Succs) {
2245 968 : if (Nodes.count(I.getSUnit()) == 0)
2246 33 : continue;
2247 1192 : if (NodeOrder.count(I.getSUnit()) != 0)
2248 451 : continue;
2249 741 : if (ignoreDependence(I, false))
2250 : continue;
2251 19 : R.insert(I.getSUnit());
2252 : }
2253 567 : // Back-edges are predecessors with an anti-dependence.
2254 : for (const auto &I : maxHeight->Preds) {
2255 567 : if (I.getKind() != SDep::Anti)
2256 1252 : continue;
2257 : if (Nodes.count(I.getSUnit()) == 0)
2258 155 : continue;
2259 530 : if (NodeOrder.count(I.getSUnit()) != 0)
2260 : continue;
2261 : R.insert(I.getSUnit());
2262 : }
2263 421 : }
2264 : Order = BottomUp;
2265 : LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2266 1415 : SmallSetVector<SUnit *, 8> N;
2267 848 : if (pred_L(NodeOrder, N, &Nodes))
2268 : R.insert(N.begin(), N.end());
2269 : } else {
2270 6 : // Choose the node with the maximum depth. If more than one, choose
2271 60 : // the node with the maximum ZeroLatencyDepth. If still more than one,
2272 : // choose the node with the lowest MOV.
2273 58 : while (!R.empty()) {
2274 : SUnit *maxDepth = nullptr;
2275 : for (SUnit *I : R) {
2276 : if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2277 : maxDepth = I;
2278 : else if (getDepth(I) == getDepth(maxDepth) &&
2279 156 : getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2280 8 : maxDepth = I;
2281 : else if (getDepth(I) == getDepth(maxDepth) &&
2282 : getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2283 : getMOV(I) < getMOV(maxDepth))
2284 : maxDepth = I;
2285 1645 : }
2286 1278 : NodeOrder.insert(maxDepth);
2287 4354 : LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2288 4882 : R.remove(maxDepth);
2289 1635 : if (Nodes.isExceedSU(maxDepth)) {
2290 1441 : Order = TopDown;
2291 1608 : R.clear();
2292 48 : R.insert(Nodes.getNode(0));
2293 2149 : break;
2294 1393 : }
2295 : for (const auto &I : maxDepth->Preds) {
2296 172 : if (Nodes.count(I.getSUnit()) == 0)
2297 : continue;
2298 1278 : if (NodeOrder.count(I.getSUnit()) != 0)
2299 : continue;
2300 1278 : R.insert(I.getSUnit());
2301 1278 : }
2302 : // Back-edges are predecessors with an anti-dependence.
2303 : for (const auto &I : maxDepth->Succs) {
2304 1 : if (I.getKind() != SDep::Anti)
2305 1 : continue;
2306 : if (Nodes.count(I.getSUnit()) == 0)
2307 3030 : continue;
2308 : if (NodeOrder.count(I.getSUnit()) != 0)
2309 429 : continue;
2310 1324 : R.insert(I.getSUnit());
2311 : }
2312 1236 : }
2313 : Order = TopDown;
2314 : LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2315 3194 : SmallSetVector<SUnit *, 8> N;
2316 1917 : if (succ_L(NodeOrder, N, &Nodes))
2317 : R.insert(N.begin(), N.end());
2318 : }
2319 38 : }
2320 263 : LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2321 : }
2322 47 :
2323 : LLVM_DEBUG({
2324 : dbgs() << "Node order: ";
2325 : for (SUnit *I : NodeOrder)
2326 : dbgs() << " " << I->NodeNum << " ";
2327 : dbgs() << "\n";
2328 368 : });
2329 11 : }
2330 :
2331 : /// Process the nodes in the computed order and create the pipelined schedule
2332 : /// of the instructions, if possible. Return true if a schedule is found.
2333 : bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2334 : if (NodeOrder.empty())
2335 : return false;
2336 :
2337 : bool scheduleFound = false;
2338 : // Keep increasing II until a valid schedule is found.
2339 : for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
2340 : Schedule.reset();
2341 188 : Schedule.setInitiationInterval(II);
2342 : LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2343 :
2344 : SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2345 188 : SetVector<SUnit *>::iterator NE = NodeOrder.end();
2346 188 : do {
2347 : SUnit *SU = *NI;
2348 :
2349 : // Compute the schedule time for the instruction, which is based
2350 : // upon the scheduled time for any predecessors/successors.
2351 452 : int EarlyStart = INT_MIN;
2352 272 : int LateStart = INT_MAX;
2353 272 : // These values are set when the size of the schedule window is limited
2354 : // due to chain dependences.
2355 : int SchedEnd = INT_MAX;
2356 : int SchedStart = INT_MIN;
2357 : Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2358 : II, this);
2359 3030 : LLVM_DEBUG({
2360 : dbgs() << "Inst (" << SU->NodeNum << ") ";
2361 : SU->getInstr()->dump();
2362 : dbgs() << "\n";
2363 3030 : });
2364 3030 : LLVM_DEBUG({
2365 : dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2366 : << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2367 3030 : });
2368 3030 :
2369 3030 : if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2370 : SchedStart > LateStart)
2371 : scheduleFound = false;
2372 : else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2373 : SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2374 : scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2375 : } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2376 : SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2377 : scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2378 : } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2379 : SchedEnd =
2380 : std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2381 3030 : // When scheduling a Phi it is better to start at the late cycle and go
2382 2982 : // backwards. The default order may insert the Phi too far away from
2383 : // its first dependence.
2384 2952 : if (SU->getInstr()->isPHI())
2385 698 : scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2386 698 : else
2387 2254 : scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2388 1498 : } else {
2389 1498 : int FirstCycle = Schedule.getFirstCycle();
2390 756 : scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2391 418 : FirstCycle + getASAP(SU) + II - 1, II);
2392 418 : }
2393 : // Even if we find a schedule, make sure the schedule doesn't exceed the
2394 : // allowable number of stages. We keep trying if this happens.
2395 : if (scheduleFound)
2396 418 : if (SwpMaxStages > -1 &&
2397 335 : Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2398 : scheduleFound = false;
2399 83 :
2400 : LLVM_DEBUG({
2401 338 : if (!scheduleFound)
2402 338 : dbgs() << "\tCan't schedule\n";
2403 676 : });
2404 : } while (++NI != NE && scheduleFound);
2405 :
2406 : // If a schedule is found, check if it is a valid schedule too.
2407 2952 : if (scheduleFound)
2408 2940 : scheduleFound = Schedule.isValidSchedule(this);
2409 2940 : }
2410 :
2411 : LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2412 :
2413 : if (scheduleFound)
2414 : Schedule.finalizeSchedule(this);
2415 : else
2416 3030 : Schedule.reset();
2417 :
2418 : return scheduleFound && Schedule.getMaxStageCount() > 0;
2419 272 : }
2420 175 :
2421 : /// Given a schedule for the loop, generate a new version of the loop,
2422 : /// and replace the old version. This function generates a prolog
2423 : /// that contains the initial iterations in the pipeline, and kernel
2424 : /// loop, and the epilogue that contains the code for the final
2425 180 : /// iterations.
2426 175 : void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2427 : // Create a new basic block for the kernel and add it to the CFG.
2428 5 : MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2429 :
2430 180 : unsigned MaxStageCount = Schedule.getMaxStageCount();
2431 :
2432 : // Remember the registers that are used in different stages. The index is
2433 : // the iteration, or stage, that the instruction is scheduled in. This is
2434 : // a map between register names in the original block and the names created
2435 : // in each stage of the pipelined loop.
2436 : ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2437 : InstrMapTy InstrMap;
2438 125 :
2439 : SmallVector<MachineBasicBlock *, 4> PrologBBs;
2440 125 : // Generate the prolog instructions that set up the pipeline.
2441 : generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2442 : MF.insert(BB->getIterator(), KernelBB);
2443 :
2444 : // Rearrange the instructions to generate the new, pipelined loop,
2445 : // and update register names as needed.
2446 : for (int Cycle = Schedule.getFirstCycle(),
2447 : LastCycle = Schedule.getFinalCycle();
2448 715 : Cycle <= LastCycle; ++Cycle) {
2449 : std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2450 : // This inner loop schedules each instruction in the cycle.
2451 : for (SUnit *CI : CycleInstrs) {
2452 : if (CI->getInstr()->isPHI())
2453 125 : continue;
2454 125 : unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2455 : MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2456 : updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2457 : KernelBB->push_back(NewMI);
2458 347 : InstrMap[NewMI] = CI->getInstr();
2459 125 : }
2460 472 : }
2461 347 :
2462 : // Copy any terminator instructions to the new kernel, and update
2463 1602 : // names as needed.
2464 1255 : for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2465 268 : E = BB->instr_end();
2466 987 : I != E; ++I) {
2467 987 : MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2468 987 : updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2469 987 : KernelBB->push_back(NewMI);
2470 987 : InstrMap[NewMI] = &*I;
2471 : }
2472 :
2473 : KernelBB->transferSuccessors(BB);
2474 : KernelBB->replaceSuccessor(BB, KernelBB);
2475 :
2476 125 : generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2477 125 : VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2478 375 : generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2479 250 : InstrMap, MaxStageCount, MaxStageCount, false);
2480 250 :
2481 250 : LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2482 250 :
2483 : SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2484 : // Generate the epilog instructions to complete the pipeline.
2485 125 : generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2486 125 : PrologBBs);
2487 :
2488 125 : // We need this step because the register allocation doesn't handle some
2489 : // situations well, so we insert copies to help out.
2490 125 : splitLifetimes(KernelBB, EpilogBBs, Schedule);
2491 :
2492 : // Remove dead instructions due to loop induction variables.
2493 : removeDeadInstructions(KernelBB, EpilogBBs);
2494 :
2495 : // Add branches between prolog and epilog blocks.
2496 : addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2497 125 :
2498 : // Remove the original loop since it's no longer referenced.
2499 : for (auto &I : *BB)
2500 : LIS.RemoveMachineInstrFromMaps(I);
2501 : BB->clear();
2502 125 : BB->eraseFromParent();
2503 :
2504 : delete[] VRMap;
2505 125 : }
2506 :
2507 : /// Generate the pipeline prolog code.
2508 125 : void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2509 : MachineBasicBlock *KernelBB,
2510 : ValueMapTy *VRMap,
2511 1630 : MBBVectorTy &PrologBBs) {
2512 1505 : MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2513 125 : assert(PreheaderBB != nullptr &&
2514 125 : "Need to add code to handle loops w/o preheader");
2515 : MachineBasicBlock *PredBB = PreheaderBB;
2516 250 : InstrMapTy InstrMap;
2517 125 :
2518 : // Generate a basic block for each stage, not including the last stage,
2519 : // which will be generated in the kernel. Each basic block may contain
2520 125 : // instructions from multiple stages/iterations.
2521 : for (unsigned i = 0; i < LastStage; ++i) {
2522 : // Create and insert the prolog basic block prior to the original loop
2523 : // basic block. The original loop is removed later.
2524 250 : MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2525 : PrologBBs.push_back(NewBB);
2526 : MF.insert(BB->getIterator(), NewBB);
2527 : NewBB->transferSuccessors(PredBB);
2528 : PredBB->addSuccessor(NewBB);
2529 : PredBB = NewBB;
2530 :
2531 : // Generate instructions for each appropriate stage. Process instructions
2532 : // in original program order.
2533 295 : for (int StageNum = i; StageNum >= 0; --StageNum) {
2534 : for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2535 : BBE = BB->getFirstTerminator();
2536 170 : BBI != BBE; ++BBI) {
2537 170 : if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2538 170 : if (BBI->isPHI())
2539 170 : continue;
2540 170 : MachineInstr *NewMI =
2541 170 : cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2542 : updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2543 : VRMap);
2544 : NewBB->push_back(NewMI);
2545 394 : InstrMap[NewMI] = &*BBI;
2546 224 : }
2547 224 : }
2548 2659 : }
2549 4870 : rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2550 : LLVM_DEBUG({
2551 259 : dbgs() << "prolog:\n";
2552 : NewBB->dump();
2553 776 : });
2554 776 : }
2555 :
2556 776 : PredBB->replaceSuccessor(BB, KernelBB);
2557 776 :
2558 : // Check if we need to remove the branch from the preheader to the original
2559 : // loop, and replace it with a branch to the new loop.
2560 : unsigned numBranches = TII->removeBranch(*PreheaderBB);
2561 170 : if (numBranches) {
2562 : SmallVector<MachineOperand, 0> Cond;
2563 : TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2564 : }
2565 : }
2566 :
2567 : /// Generate the pipeline epilog code. The epilog code finishes the iterations
2568 125 : /// that were started in either the prolog or the kernel. We create a basic
2569 : /// block for each stage that needs to complete.
2570 : void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2571 : MachineBasicBlock *KernelBB,
2572 125 : ValueMapTy *VRMap,
2573 125 : MBBVectorTy &EpilogBBs,
2574 : MBBVectorTy &PrologBBs) {
2575 70 : // We need to change the branch from the kernel to the first epilog block, so
2576 : // this call to analyze branch uses the kernel rather than the original BB.
2577 125 : MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2578 : SmallVector<MachineOperand, 4> Cond;
2579 : bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2580 : assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2581 : if (checkBranch)
2582 125 : return;
2583 :
2584 : MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2585 : if (*LoopExitI == KernelBB)
2586 : ++LoopExitI;
2587 : assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2588 : MachineBasicBlock *LoopExitBB = *LoopExitI;
2589 125 :
2590 : MachineBasicBlock *PredBB = KernelBB;
2591 125 : MachineBasicBlock *EpilogStart = LoopExitBB;
2592 : InstrMapTy InstrMap;
2593 125 :
2594 : // Generate a basic block for each stage, not including the last stage,
2595 : // which was generated for the kernel. Each basic block may contain
2596 : // instructions from multiple stages/iterations.
2597 125 : int EpilogStage = LastStage + 1;
2598 : for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2599 : MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2600 125 : EpilogBBs.push_back(NewBB);
2601 : MF.insert(BB->getIterator(), NewBB);
2602 :
2603 : PredBB->replaceSuccessor(LoopExitBB, NewBB);
2604 : NewBB->addSuccessor(LoopExitBB);
2605 :
2606 : if (EpilogStart == LoopExitBB)
2607 : EpilogStart = NewBB;
2608 :
2609 125 : // Add instructions to the epilog depending on the current block.
2610 295 : // Process instructions in original program order.
2611 170 : for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2612 170 : for (auto &BBI : *BB) {
2613 170 : if (BBI.isPHI())
2614 : continue;
2615 170 : MachineInstr *In = &BBI;
2616 170 : if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2617 : // Instructions with memoperands in the epilog are updated with
2618 170 : // conservative values.
2619 125 : MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
2620 : updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2621 : NewBB->push_back(NewMI);
2622 : InstrMap[NewMI] = In;
2623 394 : }
2624 3107 : }
2625 : }
2626 : generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2627 : VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2628 2353 : generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2629 : InstrMap, LastStage, EpilogStage, i == 1);
2630 : PredBB = NewBB;
2631 643 :
2632 643 : LLVM_DEBUG({
2633 643 : dbgs() << "epilog:\n";
2634 643 : NewBB->dump();
2635 : });
2636 : }
2637 :
2638 340 : // Fix any Phi nodes in the loop exit block.
2639 : for (MachineInstr &MI : *LoopExitBB) {
2640 170 : if (!MI.isPHI())
2641 : break;
2642 170 : for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2643 : MachineOperand &MO = MI.getOperand(i);
2644 : if (MO.getMBB() == BB)
2645 : MO.setMBB(PredBB);
2646 : }
2647 : }
2648 :
2649 : // Create a branch to the new epilog from the kernel.
2650 : // Remove the original branch and add a new branch to the epilog.
2651 139 : TII->removeBranch(*KernelBB);
2652 : TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2653 : // Add a branch to the loop exit.
2654 45 : if (EpilogBBs.size() > 0) {
2655 31 : MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2656 31 : SmallVector<MachineOperand, 4> Cond1;
2657 : TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2658 : }
2659 : }
2660 :
2661 : /// Replace all uses of FromReg that appear outside the specified
2662 : /// basic block with ToReg.
2663 125 : static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2664 250 : MachineBasicBlock *MBB,
2665 : MachineRegisterInfo &MRI,
2666 250 : LiveIntervals &LIS) {
2667 125 : for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2668 : E = MRI.use_end();
2669 250 : I != E;) {
2670 : MachineOperand &O = *I;
2671 : ++I;
2672 : if (O.getParent()->getParent() != MBB)
2673 : O.setReg(ToReg);
2674 : }
2675 960 : if (!LIS.hasInterval(ToReg))
2676 : LIS.createEmptyInterval(ToReg);
2677 : }
2678 :
2679 960 : /// Return true if the register has a use that occurs outside the
2680 : /// specified loop.
2681 2324 : static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2682 : MachineRegisterInfo &MRI) {
2683 : for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2684 1364 : E = MRI.use_end();
2685 70 : I != E; ++I)
2686 : if (I->getParent()->getParent() != BB)
2687 : return true;
2688 960 : return false;
2689 960 : }
2690 :
2691 : /// Generate Phis for the specific block in the generated pipelined code.
2692 : /// This function looks at the Phis from the original code to guide the
2693 370 : /// creation of new Phis.
2694 : void SwingSchedulerDAG::generateExistingPhis(
2695 370 : MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2696 : MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2697 780 : InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2698 418 : bool IsLast) {
2699 : // Compute the stage number for the initial value of the Phi, which
2700 : // comes from the prolog. The prolog to use depends on to which kernel/
2701 : // epilog that we're adding the Phi.
2702 : unsigned PrologStage = 0;
2703 : unsigned PrevStage = 0;
2704 : bool InKernel = (LastStageNum == CurStageNum);
2705 : if (InKernel) {
2706 295 : PrologStage = LastStageNum - 1;
2707 : PrevStage = CurStageNum;
2708 : } else {
2709 : PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2710 : PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2711 : }
2712 :
2713 : for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2714 : BBE = BB->getFirstNonPHI();
2715 : BBI != BBE; ++BBI) {
2716 295 : unsigned Def = BBI->getOperand(0).getReg();
2717 295 :
2718 125 : unsigned InitVal = 0;
2719 : unsigned LoopVal = 0;
2720 : getPhiRegs(*BBI, BB, InitVal, LoopVal);
2721 170 :
2722 170 : unsigned PhiOp1 = 0;
2723 : // The Phi value from the loop body typically is defined in the loop, but
2724 : // not always. So, we need to check if the value is defined in the loop.
2725 295 : unsigned PhiOp2 = LoopVal;
2726 295 : if (VRMap[LastStageNum].count(LoopVal))
2727 953 : PhiOp2 = VRMap[LastStageNum][LoopVal];
2728 658 :
2729 : int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2730 : int LoopValStage =
2731 658 : Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2732 658 : unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2733 : if (NumStages == 0) {
2734 658 : // We don't need to generate a Phi anymore, but we need to rename any uses
2735 : // of the Phi value.
2736 : unsigned NewReg = VRMap[PrevStage][LoopVal];
2737 658 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2738 658 : Def, InitVal, NewReg);
2739 650 : if (VRMap[CurStageNum].count(LoopVal))
2740 : VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2741 658 : }
2742 : // Adjust the number of Phis needed depending on the number of prologs left,
2743 658 : // and the distance from where the Phi is first scheduled. The number of
2744 658 : // Phis cannot exceed the number of prolog stages. Each stage can
2745 658 : // potentially define two values.
2746 : unsigned MaxPhis = PrologStage + 2;
2747 : if (!InKernel && (int)PrologStage <= LoopValStage)
2748 5 : MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
2749 5 : unsigned NumPhis = std::min(NumStages, MaxPhis);
2750 :
2751 5 : unsigned NewReg = 0;
2752 5 : unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2753 : // In the epilog, we may need to look back one stage to get the correct
2754 : // Phi name because the epilog and prolog blocks execute the same stage.
2755 : // The correct name is from the previous block only when the Phi has
2756 : // been completely scheduled prior to the epilog, and Phi value is not
2757 : // needed in multiple stages.
2758 658 : int StageDiff = 0;
2759 658 : if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2760 366 : NumPhis == 1)
2761 658 : StageDiff = 1;
2762 : // Adjust the computations below when the phi and the loop definition
2763 : // are scheduled in different stages.
2764 658 : if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2765 : StageDiff = StageScheduled - LoopValStage;
2766 : for (unsigned np = 0; np < NumPhis; ++np) {
2767 : // If the Phi hasn't been scheduled, then use the initial Phi operand
2768 : // value. Otherwise, use the scheduled version of the instruction. This
2769 : // is a little complicated when a Phi references another Phi.
2770 : if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2771 658 : PhiOp1 = InitVal;
2772 368 : // Check if the Phi has already been scheduled in a prolog stage.
2773 : else if (PrologStage >= AccessStage + StageDiff + np &&
2774 : VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2775 : PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2776 658 : // Check if the Phi has already been scheduled, but the loop intruction
2777 1 : // is either another Phi, or doesn't occur in the loop.
2778 1376 : else if (PrologStage >= AccessStage + StageDiff + np) {
2779 : // If the Phi references another Phi, we need to examine the other
2780 : // Phi to get the correct value.
2781 : PhiOp1 = LoopVal;
2782 718 : MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2783 228 : int Indirects = 1;
2784 : while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2785 490 : int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2786 307 : if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2787 279 : PhiOp1 = getInitPhiReg(*InstOp1, BB);
2788 : else
2789 : PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2790 211 : InstOp1 = MRI.getVRegDef(PhiOp1);
2791 : int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2792 : int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2793 28 : if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2794 28 : VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2795 : PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2796 75 : break;
2797 28 : }
2798 28 : ++Indirects;
2799 36 : }
2800 : } else
2801 20 : PhiOp1 = InitVal;
2802 28 : // If this references a generated Phi in the kernel, get the Phi operand
2803 18 : // from the incoming block.
2804 10 : if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2805 28 : if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2806 10 : PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2807 10 :
2808 10 : MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2809 : bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2810 18 : // In the epilog, a map lookup is needed to get the value from the kernel,
2811 : // or previous epilog block. How is does this depends on if the
2812 : // instruction is scheduled in the previous block.
2813 183 : if (!InKernel) {
2814 : int StageDiffAdj = 0;
2815 : if (LoopValStage != -1 && StageScheduled > LoopValStage)
2816 718 : StageDiffAdj = StageScheduled - LoopValStage;
2817 68 : // Use the loop value defined in the kernel, unless the kernel
2818 8 : // contains the last definition of the Phi.
2819 : if (np == 0 && PrevStage == LastStageNum &&
2820 718 : (StageScheduled != 0 || LoopValStage != 0) &&
2821 718 : VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2822 : PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2823 : // Use the value defined by the Phi. We add one because we switch
2824 : // from looking at the loop value to the Phi definition.
2825 718 : else if (np > 0 && PrevStage == LastStageNum &&
2826 : VRMap[PrevStage - np + 1].count(Def))
2827 428 : PhiOp2 = VRMap[PrevStage - np + 1][Def];
2828 3 : // Use the loop value defined in the kernel.
2829 : else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
2830 : VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2831 696 : PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2832 428 : // Use the value defined by the Phi, unless we're generating the first
2833 104 : // epilog and the Phi refers to a Phi in a different stage.
2834 104 : else if (VRMap[PrevStage - np].count(Def) &&
2835 : (!LoopDefIsPhi || PrevStage != LastStageNum))
2836 : PhiOp2 = VRMap[PrevStage - np][Def];
2837 324 : }
2838 27 :
2839 27 : // Check if we can reuse an existing Phi. This occurs when a Phi
2840 : // references another Phi, and the other Phi is scheduled in an
2841 297 : // earlier stage. We can try to reuse an existing Phi up until the last
2842 38 : // stage of the current Phi.
2843 38 : if (LoopDefIsPhi) {
2844 : if (static_cast<int>(PrologStage - np) >= StageScheduled) {
2845 : int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2846 259 : int StageDiff = (StageScheduled - LoopValStage);
2847 259 : LVNumStages -= StageDiff;
2848 244 : // Make sure the loop value Phi has been processed already.
2849 : if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
2850 : NewReg = PhiOp2;
2851 : unsigned ReuseStage = CurStageNum;
2852 : if (Schedule.isLoopCarried(this, *PhiInst))
2853 : ReuseStage -= LVNumStages;
2854 : // Check if the Phi to reuse has been generated yet. If not, then
2855 718 : // there is nothing to reuse.
2856 51 : if (VRMap[ReuseStage - np].count(LoopVal)) {
2857 42 : NewReg = VRMap[ReuseStage - np][LoopVal];
2858 42 :
2859 42 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2860 : &*BBI, Def, NewReg);
2861 42 : // Update the map with the new Phi name.
2862 : VRMap[CurStageNum - np][Def] = NewReg;
2863 : PhiOp2 = NewReg;
2864 0 : if (VRMap[LastStageNum - np - 1].count(LoopVal))
2865 0 : PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2866 :
2867 : if (IsLast && np == NumPhis - 1)
2868 0 : replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2869 0 : continue;
2870 : }
2871 0 : }
2872 : }
2873 : if (InKernel && StageDiff > 0 &&
2874 0 : VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2875 : PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2876 0 : }
2877 0 :
2878 : const TargetRegisterClass *RC = MRI.getRegClass(Def);
2879 0 : NewReg = MRI.createVirtualRegister(RC);
2880 0 :
2881 0 : MachineInstrBuilder NewPhi =
2882 : BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2883 : TII->get(TargetOpcode::PHI), NewReg);
2884 : NewPhi.addReg(PhiOp1).addMBB(BB1);
2885 51 : NewPhi.addReg(PhiOp2).addMBB(BB2);
2886 1 : if (np == 0)
2887 1 : InstrMap[NewPhi] = &*BBI;
2888 :
2889 : // We define the Phis after creating the new pipelined code, so
2890 718 : // we need to rename the Phi values in scheduled instructions.
2891 718 :
2892 : unsigned PrevReg = 0;
2893 : if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2894 1436 : PrevReg = VRMap[PrevStage - np][LoopVal];
2895 718 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2896 718 : Def, NewReg, PrevReg);
2897 718 : // If the Phi has been scheduled, use the new name for rewriting.
2898 718 : if (VRMap[CurStageNum - np].count(Def)) {
2899 653 : unsigned R = VRMap[CurStageNum - np][Def];
2900 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2901 : R, NewReg);
2902 : }
2903 :
2904 : // Check if we need to rename any uses that occurs after the loop. The
2905 718 : // register to replace depends on whether the Phi is scheduled in the
2906 280 : // epilog.
2907 718 : if (IsLast && np == NumPhis - 1)
2908 : replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2909 :
2910 718 : // In the kernel, a dependent Phi uses the value from this Phi.
2911 38 : if (InKernel)
2912 38 : PhiOp2 = NewReg;
2913 :
2914 : // Update the map with the new Phi name.
2915 : VRMap[CurStageNum - np][Def] = NewReg;
2916 : }
2917 :
2918 : while (NumPhis++ < NumStages) {
2919 718 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2920 268 : &*BBI, Def, NewReg, 0);
2921 : }
2922 :
2923 718 : // Check if we need to rename a Phi that has been eliminated due to
2924 : // scheduling.
2925 : if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2926 : replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2927 718 : }
2928 : }
2929 :
2930 668 : /// Generate Phis for the specified block in the generated pipelined code.
2931 10 : /// These are new Phis needed because the definition is scheduled after the
2932 : /// use in the pipelined sequence.
2933 : void SwingSchedulerDAG::generatePhis(
2934 : MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2935 : MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2936 : InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2937 658 : bool IsLast) {
2938 0 : // Compute the stage number that contains the initial Phi value, and
2939 : // the Phi from the previous stage.
2940 295 : unsigned PrologStage = 0;
2941 : unsigned PrevStage = 0;
2942 : unsigned StageDiff = CurStageNum - LastStageNum;
2943 : bool InKernel = (StageDiff == 0);
2944 : if (InKernel) {
2945 295 : PrologStage = LastStageNum - 1;
2946 : PrevStage = CurStageNum;
2947 : } else {
2948 : PrologStage = LastStageNum - StageDiff;
2949 : PrevStage = LastStageNum + StageDiff - 1;
2950 : }
2951 :
2952 : for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2953 : BBE = BB->instr_end();
2954 295 : BBI != BBE; ++BBI) {
2955 : for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2956 295 : MachineOperand &MO = BBI->getOperand(i);
2957 125 : if (!MO.isReg() || !MO.isDef() ||
2958 : !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2959 : continue;
2960 170 :
2961 170 : int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2962 : assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2963 : unsigned Def = MO.getReg();
2964 295 : unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2965 295 : // An instruction scheduled in stage 0 and is used after the loop
2966 3291 : // requires a phi in the epilog for the last definition from either
2967 13248 : // the kernel or prolog.
2968 10252 : if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2969 10252 : hasUseAfterLoop(Def, BB, MRI))
2970 3376 : NumPhis = 1;
2971 8401 : if (!InKernel && (unsigned)StageScheduled > PrologStage)
2972 : continue;
2973 2420 :
2974 : unsigned PhiOp2 = VRMap[PrevStage][Def];
2975 2420 : if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2976 2420 : if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2977 : PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2978 : // The number of Phis can't exceed the number of prolog stages. The
2979 : // prolog stage number is zero based.
2980 2790 : if (NumPhis > PrologStage + 1 - StageScheduled)
2981 370 : NumPhis = PrologStage + 1 - StageScheduled;
2982 : for (unsigned np = 0; np < NumPhis; ++np) {
2983 2420 : unsigned PhiOp1 = VRMap[PrologStage][Def];
2984 : if (np <= PrologStage)
2985 : PhiOp1 = VRMap[PrologStage - np][Def];
2986 1851 : if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2987 1851 : if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2988 88 : PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2989 : if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2990 : PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2991 : }
2992 1851 : if (!InKernel)
2993 : PhiOp2 = VRMap[PrevStage - np][Def];
2994 2524 :
2995 673 : const TargetRegisterClass *RC = MRI.getRegClass(Def);
2996 673 : unsigned NewReg = MRI.createVirtualRegister(RC);
2997 673 :
2998 673 : MachineInstrBuilder NewPhi =
2999 306 : BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
3000 : TII->get(TargetOpcode::PHI), NewReg);
3001 306 : NewPhi.addReg(PhiOp1).addMBB(BB1);
3002 : NewPhi.addReg(PhiOp2).addMBB(BB2);
3003 : if (np == 0)
3004 673 : InstrMap[NewPhi] = &*BBI;
3005 385 :
3006 : // Rewrite uses and update the map. The actions depend upon whether
3007 673 : // we generating code for the kernel or epilog blocks.
3008 673 : if (InKernel) {
3009 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
3010 : &*BBI, PhiOp1, NewReg);
3011 1346 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
3012 673 : &*BBI, PhiOp2, NewReg);
3013 673 :
3014 673 : PhiOp2 = NewReg;
3015 673 : VRMap[PrevStage - np - 1][Def] = NewReg;
3016 636 : } else {
3017 : VRMap[CurStageNum - np][Def] = NewReg;
3018 : if (np == NumPhis - 1)
3019 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
3020 673 : &*BBI, Def, NewReg);
3021 288 : }
3022 : if (IsLast && np == NumPhis - 1)
3023 288 : replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
3024 : }
3025 : }
3026 : }
3027 288 : }
3028 :
3029 385 : /// Remove instructions that generate values with no uses.
3030 385 : /// Typically, these are induction variable operations that generate values
3031 365 : /// used in the loop itself. A dead instruction has a definition with
3032 : /// no uses, or uses that occur in the original loop only.
3033 : void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
3034 673 : MBBVectorTy &EpilogBBs) {
3035 221 : // For each epilog block, check that the value defined by each instruction
3036 : // is used. If not, delete it.
3037 : for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
3038 : MBE = EpilogBBs.rend();
3039 295 : MBB != MBE; ++MBB)
3040 : for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
3041 : ME = (*MBB)->instr_rend();
3042 : MI != ME;) {
3043 : // From DeadMachineInstructionElem. Don't delete inline assembly.
3044 : if (MI->isInlineAsm()) {
3045 125 : ++MI;
3046 : continue;
3047 : }
3048 : bool SawStore = false;
3049 : // Check if it's safe to remove the instruction due to side effects.
3050 : // We can, and want to, remove Phis here.
3051 295 : if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
3052 170 : ++MI;
3053 : continue;
3054 1751 : }
3055 : bool used = true;
3056 1581 : for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
3057 : MOE = MI->operands_end();
3058 523 : MOI != MOE; ++MOI) {
3059 : if (!MOI->isReg() || !MOI->isDef())
3060 1581 : continue;
3061 : unsigned reg = MOI->getReg();
3062 : // Assume physical registers are used, unless they are marked dead.
3063 1581 : if (TargetRegisterInfo::isPhysicalRegister(reg)) {
3064 : used = !MOI->isDead();
3065 268 : if (used)
3066 : break;
3067 : continue;
3068 1252 : }
3069 1313 : unsigned realUses = 0;
3070 2565 : for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
3071 2310 : EI = MRI.use_end();
3072 : UI != EI; ++UI) {
3073 1313 : // Check if there are any uses that occur only in the original
3074 : // loop. If so, that's not a real use.
3075 1313 : if (UI->getParent()->getParent() != BB) {
3076 0 : realUses++;
3077 0 : used = true;
3078 : break;
3079 : }
3080 : }
3081 : if (realUses > 0)
3082 1313 : break;
3083 : used = false;
3084 1313 : }
3085 : if (!used) {
3086 : LIS.RemoveMachineInstrFromMaps(*MI);
3087 1058 : MI++->eraseFromParent();
3088 : continue;
3089 : }
3090 : ++MI;
3091 : }
3092 : // In the kernel block, check if we can remove a Phi that generates a value
3093 1313 : // used in an instruction removed in the epilog block.
3094 : for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
3095 : BBE = KernelBB->getFirstNonPHI();
3096 : BBI != BBE;) {
3097 1313 : MachineInstr *MI = &*BBI;
3098 255 : ++BBI;
3099 255 : unsigned reg = MI->getOperand(0).getReg();
3100 255 : if (MRI.use_begin(reg) == MRI.use_end()) {
3101 : LIS.RemoveMachineInstrFromMaps(*MI);
3102 : MI->eraseFromParent();
3103 : }
3104 : }
3105 : }
3106 :
3107 125 : /// For loop carried definitions, we split the lifetime of a virtual register
3108 703 : /// that has uses past the definition in the next iteration. A copy with a new
3109 : /// virtual register is inserted before the definition, which helps with
3110 : /// generating a better register assignment.
3111 578 : ///
3112 578 : /// v1 = phi(a, v2) v1 = phi(a, v2)
3113 0 : /// v2 = phi(b, v3) v2 = phi(b, v3)
3114 0 : /// v3 = .. v4 = copy v1
3115 : /// .. = V1 v3 = ..
3116 : /// .. = v4
3117 125 : void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
3118 : MBBVectorTy &EpilogBBs,
3119 : SMSchedule &Schedule) {
3120 : const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3121 : for (auto &PHI : KernelBB->phis()) {
3122 : unsigned Def = PHI.getOperand(0).getReg();
3123 : // Check for any Phi definition that used as an operand of another Phi
3124 : // in the same block.
3125 : for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
3126 : E = MRI.use_instr_end();
3127 : I != E; ++I) {
3128 : if (I->isPHI() && I->getParent() == KernelBB) {
3129 0 : // Get the loop carried definition.
3130 : unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
3131 : if (!LCDef)
3132 0 : continue;
3133 0 : MachineInstr *MI = MRI.getVRegDef(LCDef);
3134 0 : if (!MI || MI->getParent() != KernelBB || MI->isPHI())
3135 : continue;
3136 : // Search through the rest of the block looking for uses of the Phi
3137 0 : // definition. If one occurs, then split the lifetime.
3138 : unsigned SplitReg = 0;
3139 0 : for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
3140 0 : KernelBB->instr_end()))
3141 : if (BBJ.readsRegister(Def)) {
3142 : // We split the lifetime when we find the first use.
3143 0 : if (SplitReg == 0) {
3144 0 : SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
3145 0 : BuildMI(*KernelBB, MI, MI->getDebugLoc(),
3146 0 : TII->get(TargetOpcode::COPY), SplitReg)
3147 0 : .addReg(Def);
3148 : }
3149 : BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
3150 : }
3151 : if (!SplitReg)
3152 0 : continue;
3153 0 : // Search through each of the epilog blocks for any uses to be renamed.
3154 : for (auto &Epilog : EpilogBBs)
3155 0 : for (auto &I : *Epilog)
3156 0 : if (I.readsRegister(Def))
3157 0 : I.substituteRegister(Def, SplitReg, 0, *TRI);
3158 0 : break;
3159 0 : }
3160 : }
3161 0 : }
3162 : }
3163 0 :
3164 0 : /// Remove the incoming block from the Phis in a basic block.
3165 : static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
3166 0 : for (MachineInstr &MI : *BB) {
3167 0 : if (!MI.isPHI())
3168 0 : break;
3169 0 : for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
3170 : if (MI.getOperand(i + 1).getMBB() == Incoming) {
3171 : MI.RemoveOperand(i + 1);
3172 : MI.RemoveOperand(i);
3173 : break;
3174 0 : }
3175 : }
3176 : }
3177 41 :
3178 176 : /// Create branches from each prolog basic block to the appropriate epilog
3179 : /// block. These edges are needed if the loop ends before reaching the
3180 : /// kernel.
3181 161 : void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
3182 322 : MachineBasicBlock *KernelBB,
3183 135 : MBBVectorTy &EpilogBBs,
3184 135 : SMSchedule &Schedule, ValueMapTy *VRMap) {
3185 135 : assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
3186 : MachineInstr *IndVar = Pass.LI.LoopInductionVar;
3187 : MachineInstr *Cmp = Pass.LI.LoopCompare;
3188 41 : MachineBasicBlock *LastPro = KernelBB;
3189 : MachineBasicBlock *LastEpi = KernelBB;
3190 :
3191 : // Start from the blocks connected to the kernel and work "out"
3192 : // to the first prolog and the last epilog blocks.
3193 125 : SmallVector<MachineInstr *, 4> PrevInsts;
3194 : unsigned MaxIter = PrologBBs.size() - 1;
3195 : unsigned LC = UINT_MAX;
3196 : unsigned LCMin = UINT_MAX;
3197 : for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
3198 125 : // Add branches to the prolog that go to the corresponding
3199 125 : // epilog, and the fall-thru prolog/kernel block.
3200 : MachineBasicBlock *Prolog = PrologBBs[j];
3201 : MachineBasicBlock *Epilog = EpilogBBs[i];
3202 : // We've executed one iteration, so decrement the loop count and check for
3203 : // the loop end.
3204 : SmallVector<MachineOperand, 4> Cond;
3205 : // Check if the LOOP0 has already been removed. If so, then there is no need
3206 125 : // to reduce the trip count.
3207 : if (LC != 0)
3208 : LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
3209 295 : MaxIter);
3210 :
3211 : // Record the value of the first trip count, which is used to determine if
3212 170 : // branches and blocks can be removed for constant trip counts.
3213 340 : if (LCMin == UINT_MAX)
3214 : LCMin = LC;
3215 :
3216 : unsigned numAdded = 0;
3217 : if (TargetRegisterInfo::isVirtualRegister(LC)) {
3218 : Prolog->addSuccessor(Epilog);
3219 170 : numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
3220 338 : } else if (j >= LCMin) {
3221 169 : Prolog->addSuccessor(Epilog);
3222 : Prolog->removeSuccessor(LastPro);
3223 : LastEpi->removeSuccessor(Epilog);
3224 : numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
3225 170 : removePhis(Epilog, LastEpi);
3226 : // Remove the blocks that are no longer referenced.
3227 : if (LastPro != LastEpi) {
3228 : LastEpi->clear();
3229 170 : LastEpi->eraseFromParent();
3230 129 : }
3231 258 : LastPro->clear();
3232 41 : LastPro->eraseFromParent();
3233 5 : } else {
3234 5 : numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
3235 5 : removePhis(Epilog, Prolog);
3236 10 : }
3237 5 : LastPro = Prolog;
3238 : LastEpi = Epilog;
3239 5 : for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
3240 : E = Prolog->instr_rend();
3241 1 : I != E && numAdded > 0; ++I, --numAdded)
3242 : updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3243 : }
3244 5 : }
3245 :
3246 72 : /// Return true if we can compute the amount the instruction changes
3247 36 : /// during each iteration. Set Delta to the amount of the change.
3248 : bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3249 : const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3250 : unsigned BaseReg;
3251 299 : int64_t Offset;
3252 : if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3253 469 : return false;
3254 299 :
3255 : MachineRegisterInfo &MRI = MF.getRegInfo();
3256 125 : // Check if there is a Phi. If so, get the definition in the loop.
3257 : MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3258 : if (BaseDef && BaseDef->isPHI()) {
3259 : BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3260 0 : BaseDef = MRI.getVRegDef(BaseReg);
3261 0 : }
3262 : if (!BaseDef)
3263 : return false;
3264 0 :
3265 0 : int D = 0;
3266 : if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
3267 0 : return false;
3268 :
3269 0 : Delta = D;
3270 0 : return true;
3271 0 : }
3272 0 :
3273 : /// Update the memory operand with a new offset when the pipeliner
3274 0 : /// generates a new copy of the instruction that refers to a
3275 0 : /// different memory location.
3276 : void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3277 0 : MachineInstr &OldMI, unsigned Num) {
3278 0 : if (Num == 0)
3279 0 : return;
3280 : // If the instruction has memory operands, then adjust the offset
3281 0 : // when the instruction appears in different stages.
3282 0 : if (NewMI.memoperands_empty())
3283 : return;
3284 : SmallVector<MachineMemOperand *, 2> NewMMOs;
3285 : for (MachineMemOperand *MMO : NewMI.memoperands()) {
3286 : if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
3287 : (!MMO->getValue())) {
3288 2406 : NewMMOs.push_back(MMO);
3289 : continue;
3290 2406 : }
3291 1956 : unsigned Delta;
3292 : if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
3293 : int64_t AdjOffset = Delta * Num;
3294 1419 : NewMMOs.push_back(
3295 : MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
3296 : } else {
3297 930 : NewMMOs.push_back(
3298 1440 : MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
3299 : }
3300 0 : }
3301 0 : NewMI.setMemRefs(MF, NewMMOs);
3302 : }
3303 :
3304 480 : /// Clone the instruction for the new pipelined loop and update the
3305 192 : /// memory operands, if needed.
3306 192 : MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3307 192 : unsigned CurStageNum,
3308 : unsigned InstStageNum) {
3309 288 : MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3310 288 : // Check for tied operands in inline asm instructions. This should be handled
3311 : // elsewhere, but I'm not sure of the best solution.
3312 : if (OldMI->isInlineAsm())
3313 450 : for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
3314 : const auto &MO = OldMI->getOperand(i);
3315 : if (MO.isReg() && MO.isUse())
3316 : break;
3317 : unsigned UseIdx;
3318 1630 : if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3319 : NewMI->tieOperands(i, UseIdx);
3320 : }
3321 1630 : updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3322 : return NewMI;
3323 : }
3324 1630 :
3325 0 : /// Clone the instruction for the new pipelined loop. If needed, this
3326 0 : /// function updates the instruction using the values saved in the
3327 0 : /// InstrChanges structure.
3328 : MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3329 : unsigned CurStageNum,
3330 0 : unsigned InstStageNum,
3331 0 : SMSchedule &Schedule) {
3332 : MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3333 1630 : DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3334 1630 : InstrChanges.find(getSUnit(OldMI));
3335 : if (It != InstrChanges.end()) {
3336 : std::pair<unsigned, int64_t> RegAndOffset = It->second;
3337 : unsigned BasePos, OffsetPos;
3338 : if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
3339 : return nullptr;
3340 776 : int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3341 : MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3342 : if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3343 : NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3344 776 : NewMI->getOperand(OffsetPos).setImm(NewOffset);
3345 : }
3346 1552 : updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3347 776 : return NewMI;
3348 26 : }
3349 :
3350 26 : /// Update the machine instruction with new virtual registers. This
3351 0 : /// function may change the defintions and/or uses.
3352 26 : void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3353 26 : unsigned CurStageNum,
3354 26 : unsigned InstrStageNum,
3355 26 : SMSchedule &Schedule,
3356 26 : ValueMapTy *VRMap) {
3357 : for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
3358 776 : MachineOperand &MO = NewMI->getOperand(i);
3359 776 : if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3360 : continue;
3361 : unsigned reg = MO.getReg();
3362 : if (MO.isDef()) {
3363 : // Create a new virtual register for the definition.
3364 2955 : const TargetRegisterClass *RC = MRI.getRegClass(reg);
3365 : unsigned NewReg = MRI.createVirtualRegister(RC);
3366 : MO.setReg(NewReg);
3367 : VRMap[CurStageNum][reg] = NewReg;
3368 : if (LastDef)
3369 12744 : replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3370 9789 : } else if (MO.isUse()) {
3371 9789 : MachineInstr *Def = MRI.getVRegDef(reg);
3372 3091 : // Compute the stage that contains the last definition for instruction.
3373 6698 : int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3374 6698 : unsigned StageNum = CurStageNum;
3375 : if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
3376 2420 : // Compute the difference in stages between the defintion and the use.
3377 2420 : unsigned StageDiff = (InstrStageNum - DefStageNum);
3378 2420 : // Make an adjustment to get the last definition.
3379 2420 : StageNum -= StageDiff;
3380 2420 : }
3381 471 : if (VRMap[StageNum].count(reg))
3382 : MO.setReg(VRMap[StageNum][reg]);
3383 4278 : }
3384 : }
3385 4278 : }
3386 :
3387 4278 : /// Return the instruction in the loop that defines the register.
3388 : /// If the definition is a Phi, then follow the Phi operand to
3389 466 : /// the instruction in the loop.
3390 : MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3391 466 : SmallPtrSet<MachineInstr *, 8> Visited;
3392 : MachineInstr *Def = MRI.getVRegDef(Reg);
3393 4278 : while (Def->isPHI()) {
3394 1938 : if (!Visited.insert(Def).second)
3395 : break;
3396 : for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3397 2955 : if (Def->getOperand(i + 1).getMBB() == BB) {
3398 : Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3399 : break;
3400 : }
3401 : }
3402 0 : return Def;
3403 : }
3404 0 :
3405 : /// Return the new name for the value from the previous stage.
3406 0 : unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3407 : unsigned LoopVal, unsigned LoopStage,
3408 0 : ValueMapTy *VRMap,
3409 0 : MachineBasicBlock *BB) {
3410 0 : unsigned PrevVal = 0;
3411 0 : if (StageNum > PhiStage) {
3412 : MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3413 : if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3414 0 : // The name is defined in the previous stage.
3415 : PrevVal = VRMap[StageNum - 1][LoopVal];
3416 : else if (VRMap[StageNum].count(LoopVal))
3417 : // The previous name is defined in the current stage when the instruction
3418 412 : // order is swapped.
3419 : PrevVal = VRMap[StageNum][LoopVal];
3420 : else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
3421 : // The loop value hasn't yet been scheduled.
3422 : PrevVal = LoopVal;
3423 412 : else if (StageNum == PhiStage + 1)
3424 69 : // The loop value is another phi, which has not been scheduled.
3425 69 : PrevVal = getInitPhiReg(*LoopInst, BB);
3426 : else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3427 52 : // The loop value is another phi, which has been scheduled.
3428 17 : PrevVal =
3429 : getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3430 : LoopStage, VRMap, BB);
3431 7 : }
3432 10 : return PrevVal;
3433 : }
3434 0 :
3435 10 : /// Rewrite the Phi values in the specified block to use the mappings
3436 : /// from the initial operand. Once the Phi is scheduled, we switch
3437 : /// to using the loop value instead of the Phi value, so those names
3438 0 : /// do not need to be rewritten.
3439 : void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3440 : unsigned StageNum,
3441 0 : SMSchedule &Schedule,
3442 : ValueMapTy *VRMap,
3443 : InstrMapTy &InstrMap) {
3444 412 : for (auto &PHI : BB->phis()) {
3445 : unsigned InitVal = 0;
3446 : unsigned LoopVal = 0;
3447 : getPhiRegs(PHI, BB, InitVal, LoopVal);
3448 : unsigned PhiDef = PHI.getOperand(0).getReg();
3449 :
3450 : unsigned PhiStage =
3451 170 : (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3452 : unsigned LoopStage =
3453 : (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3454 : unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3455 : if (NumPhis > StageNum)
3456 730 : NumPhis = StageNum;
3457 : for (unsigned np = 0; np <= NumPhis; ++np) {
3458 : unsigned NewVal =
3459 390 : getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3460 390 : if (!NewVal)
3461 : NewVal = InitVal;
3462 : rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
3463 390 : PhiDef, NewVal);
3464 : }
3465 780 : }
3466 : }
3467 390 :
3468 : /// Rewrite a previously scheduled instruction to use the register value
3469 802 : /// from the new instruction. Make sure the instruction occurs in the
3470 : /// basic block, and we don't change the uses in the new instruction.
3471 412 : void SwingSchedulerDAG::rewriteScheduledInstr(
3472 412 : MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3473 : unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3474 412 : unsigned NewReg, unsigned PrevReg) {
3475 : bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3476 : int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3477 : // Rewrite uses that have been scheduled already to use the new
3478 170 : // Phi register.
3479 : for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3480 : EI = MRI.use_end();
3481 : UI != EI;) {
3482 : MachineOperand &UseOp = *UI;
3483 2124 : MachineInstr *UseMI = UseOp.getParent();
3484 : ++UI;
3485 : if (UseMI->getParent() != BB)
3486 : continue;
3487 2124 : if (UseMI->isPHI()) {
3488 2124 : if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3489 : continue;
3490 : if (getLoopPhiReg(*UseMI, BB) != OldReg)
3491 2124 : continue;
3492 : }
3493 7328 : InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3494 : assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3495 5204 : SUnit *OrigMISU = getSUnit(OrigInstr->second);
3496 : int StageSched = Schedule.stageScheduled(OrigMISU);
3497 5204 : int CycleSched = Schedule.cycleScheduled(OrigMISU);
3498 3346 : unsigned ReplaceReg = 0;
3499 : // This is the stage for the scheduled instruction.
3500 598 : if (StagePhi == StageSched && Phi->isPHI()) {
3501 : int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3502 86 : if (PrevReg && InProlog)
3503 : ReplaceReg = PrevReg;
3504 : else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3505 1858 : (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3506 : ReplaceReg = PrevReg;
3507 1858 : else
3508 1858 : ReplaceReg = NewReg;
3509 1858 : }
3510 : // The scheduled instruction occurs before the scheduled Phi, and the
3511 : // Phi is not loop carried.
3512 1858 : if (!InProlog && StagePhi + 1 == StageSched &&
3513 977 : !Schedule.isLoopCarried(this, *Phi))
3514 977 : ReplaceReg = NewReg;
3515 : if (StagePhi > StageSched && Phi->isPHI())
3516 977 : ReplaceReg = NewReg;
3517 0 : if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3518 : ReplaceReg = NewReg;
3519 : if (ReplaceReg) {
3520 : MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3521 : UseOp.setReg(ReplaceReg);
3522 : }
3523 : }
3524 2594 : }
3525 736 :
3526 : /// Check if we can change the instruction to use an offset value from the
3527 1858 : /// previous iteration. If so, return true and set the base and offset values
3528 : /// so that we can rewrite the load, if necessary.
3529 2559 : /// v1 = Phi(v0, v3)
3530 : /// v2 = load v1, 0
3531 1858 : /// v3 = post_store v1, 4, x
3532 3392 : /// This function enables the load to be rewritten as v2 = load v3, 4.
3533 1696 : bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3534 : unsigned &BasePos,
3535 : unsigned &OffsetPos,
3536 2124 : unsigned &NewBase,
3537 : int64_t &Offset) {
3538 : // Get the load instruction.
3539 : if (TII->isPostIncrement(*MI))
3540 : return false;
3541 : unsigned BasePosLd, OffsetPosLd;
3542 : if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3543 : return false;
3544 : unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3545 0 :
3546 : // Look for the Phi instruction.
3547 : MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3548 : MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3549 : if (!Phi || !Phi->isPHI())
3550 : return false;
3551 0 : // Get the register defined in the loop block.
3552 0 : unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3553 : if (!PrevReg)
3554 0 : return false;
3555 0 :
3556 0 : // Check for the post-increment load/store instruction.
3557 : MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3558 : if (!PrevDef || PrevDef == MI)
3559 0 : return false;
3560 0 :
3561 0 : if (!TII->isPostIncrement(*PrevDef))
3562 0 : return false;
3563 :
3564 0 : unsigned BasePos1 = 0, OffsetPos1 = 0;
3565 0 : if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3566 0 : return false;
3567 :
3568 : // Make sure that the instructions do not access the same memory location in
3569 0 : // the next iteration.
3570 0 : int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3571 0 : int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3572 : MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3573 0 : NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3574 0 : bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3575 : MF.DeleteMachineInstr(NewMI);
3576 0 : if (!Disjoint)
3577 0 : return false;
3578 0 :
3579 : // Set the return value once we determine that we return true.
3580 : BasePos = BasePosLd;
3581 : OffsetPos = OffsetPosLd;
3582 0 : NewBase = PrevReg;
3583 0 : Offset = StoreOffset;
3584 0 : return true;
3585 0 : }
3586 0 :
3587 0 : /// Apply changes to the instruction if needed. The changes are need
3588 0 : /// to improve the scheduling and depend up on the final schedule.
3589 0 : void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3590 : SMSchedule &Schedule) {
3591 : SUnit *SU = getSUnit(MI);
3592 0 : DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3593 0 : InstrChanges.find(SU);
3594 0 : if (It != InstrChanges.end()) {
3595 0 : std::pair<unsigned, int64_t> RegAndOffset = It->second;
3596 0 : unsigned BasePos, OffsetPos;
3597 : if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3598 : return;
3599 : unsigned BaseReg = MI->getOperand(BasePos).getReg();
3600 : MachineInstr *LoopDef = findDefInLoop(BaseReg);
3601 1633 : int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3602 : int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3603 : int BaseStageNum = Schedule.stageScheduled(SU);
3604 : int BaseCycleNum = Schedule.cycleScheduled(SU);
3605 1633 : if (BaseStageNum < DefStageNum) {
3606 1633 : MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3607 25 : int OffsetDiff = DefStageNum - BaseStageNum;
3608 : if (DefCycleNum < BaseCycleNum) {
3609 25 : NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3610 0 : if (OffsetDiff > 0)
3611 25 : --OffsetDiff;
3612 25 : }
3613 25 : int64_t NewOffset =
3614 25 : MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3615 25 : NewMI->getOperand(OffsetPos).setImm(NewOffset);
3616 25 : SU->setInstr(NewMI);
3617 25 : MISUnitMap[NewMI] = SU;
3618 19 : NewMIs.insert(NewMI);
3619 19 : }
3620 19 : }
3621 4 : }
3622 :
3623 2 : /// Return true for an order or output dependence that is loop carried
3624 : /// potentially. A dependence is loop carried if the destination defines a valu
3625 : /// that may be used or defined by the source in a subsequent iteration.
3626 19 : bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
3627 19 : bool isSucc) {
3628 : if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
3629 19 : Dep.isArtificial())
3630 19 : return false;
3631 :
3632 : if (!SwpPruneLoopCarried)
3633 : return true;
3634 :
3635 : if (Dep.getKind() == SDep::Output)
3636 : return true;
3637 :
3638 4214 : MachineInstr *SI = Source->getInstr();
3639 : MachineInstr *DI = Dep.getSUnit()->getInstr();
3640 4214 : if (!isSucc)
3641 : std::swap(SI, DI);
3642 : assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3643 :
3644 712 : // Assume ordered loads and stores may have a loop carried dependence.
3645 : if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3646 : SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3647 710 : return true;
3648 :
3649 : // Only chain dependences between a load and store can be loop carried.
3650 710 : if (!DI->mayStore() || !SI->mayLoad())
3651 710 : return false;
3652 710 :
3653 : unsigned DeltaS, DeltaD;
3654 : if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3655 : return true;
3656 :
3657 2124 : unsigned BaseRegS, BaseRegD;
3658 2124 : int64_t OffsetS, OffsetD;
3659 3 : const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3660 : if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
3661 : !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
3662 707 : return true;
3663 104 :
3664 : if (BaseRegS != BaseRegD)
3665 : return true;
3666 603 :
3667 218 : // Check that the base register is incremented by a constant value for each
3668 : // iteration.
3669 : MachineInstr *Def = MRI.getVRegDef(BaseRegS);
3670 : if (!Def || !Def->isPHI())
3671 385 : return true;
3672 770 : unsigned InitVal = 0;
3673 385 : unsigned LoopVal = 0;
3674 0 : getPhiRegs(*Def, BB, InitVal, LoopVal);
3675 : MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
3676 385 : int D = 0;
3677 : if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
3678 : return true;
3679 :
3680 : uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3681 118 : uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3682 118 :
3683 : // This is the main test, which checks the offset values and the loop
3684 : // increment value to determine if the accesses may be loop carried.
3685 : if (OffsetS >= OffsetD)
3686 118 : return OffsetS + AccessSizeS > DeltaS;
3687 118 : else
3688 118 : return OffsetD + AccessSizeD > DeltaD;
3689 118 :
3690 0 : return true;
3691 : }
3692 118 :
3693 118 : void SwingSchedulerDAG::postprocessDAG() {
3694 : for (auto &M : Mutations)
3695 : M->apply(this);
3696 : }
3697 118 :
3698 72 : /// Try to schedule the node at the specified StartCycle and continue
3699 : /// until the node is schedule or the EndCycle is reached. This function
3700 46 : /// returns true if the node is scheduled. This routine may search either
3701 : /// forward or backward for a place to insert the instruction based upon
3702 : /// the relative values of StartCycle and EndCycle.
3703 : bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3704 : bool forward = true;
3705 : if (StartCycle > EndCycle)
3706 760 : forward = false;
3707 570 :
3708 : // The terminating condition depends on the direction.
3709 : int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3710 : for (int curCycle = StartCycle; curCycle != termCycle;
3711 : forward ? ++curCycle : --curCycle) {
3712 :
3713 : // Add the already scheduled instructions at the specified cycle to the DFA.
3714 : Resources->clearResources();
3715 2952 : for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3716 : checkCycle <= LastCycle; checkCycle += II) {
3717 2952 : std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3718 :
3719 : for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3720 : E = cycleInstrs.end();
3721 2952 : I != E; ++I) {
3722 4686 : if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3723 : continue;
3724 : assert(Resources->canReserveResources(*(*I)->getInstr()) &&
3725 : "These instructions have already been scheduled.");
3726 : Resources->reserveResources(*(*I)->getInstr());
3727 3807 : }
3728 8642 : }
3729 4835 : if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3730 : Resources->canReserveResources(*SU->getInstr())) {
3731 : LLVM_DEBUG({
3732 : dbgs() << "\tinsert at cycle " << curCycle << " ";
3733 10479 : SU->getInstr()->dump();
3734 5644 : });
3735 :
3736 : ScheduledInstrs[curCycle].push_back(SU);
3737 : InstrToCycle.insert(std::make_pair(SU, curCycle));
3738 4339 : if (curCycle > LastCycle)
3739 : LastCycle = curCycle;
3740 : if (curCycle < FirstCycle)
3741 7045 : FirstCycle = curCycle;
3742 3238 : return true;
3743 : }
3744 : LLVM_DEBUG({
3745 : dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3746 : SU->getInstr()->dump();
3747 : });
3748 2940 : }
3749 2940 : return false;
3750 2940 : }
3751 424 :
3752 2940 : // Return the cycle of the earliest scheduled instruction in the chain.
3753 110 : int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3754 2940 : SmallPtrSet<SUnit *, 8> Visited;
3755 : SmallVector<SDep, 8> Worklist;
3756 : Worklist.push_back(Dep);
3757 : int EarlyCycle = INT_MAX;
3758 : while (!Worklist.empty()) {
3759 : const SDep &Cur = Worklist.pop_back_val();
3760 : SUnit *PrevSU = Cur.getSUnit();
3761 12 : if (Visited.count(PrevSU))
3762 : continue;
3763 : std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3764 : if (it == InstrToCycle.end())
3765 16 : continue;
3766 : EarlyCycle = std::min(EarlyCycle, it->second);
3767 : for (const auto &PI : PrevSU->Preds)
3768 16 : if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3769 16 : Worklist.push_back(PI);
3770 33 : Visited.insert(PrevSU);
3771 17 : }
3772 17 : return EarlyCycle;
3773 17 : }
3774 1 :
3775 : // Return the cycle of the latest scheduled instruction in the chain.
3776 17 : int SMSchedule::latestCycleInChain(const SDep &Dep) {
3777 : SmallPtrSet<SUnit *, 8> Visited;
3778 16 : SmallVector<SDep, 8> Worklist;
3779 41 : Worklist.push_back(Dep);
3780 25 : int LateCycle = INT_MIN;
3781 1 : while (!Worklist.empty()) {
3782 16 : const SDep &Cur = Worklist.pop_back_val();
3783 : SUnit *SuccSU = Cur.getSUnit();
3784 16 : if (Visited.count(SuccSU))
3785 : continue;
3786 : std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3787 : if (it == InstrToCycle.end())
3788 356 : continue;
3789 : LateCycle = std::max(LateCycle, it->second);
3790 : for (const auto &SI : SuccSU->Succs)
3791 356 : if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3792 356 : Worklist.push_back(SI);
3793 1093 : Visited.insert(SuccSU);
3794 737 : }
3795 737 : return LateCycle;
3796 737 : }
3797 140 :
3798 : /// If an instruction has a use that spans multiple iterations, then
3799 633 : /// return true. These instructions are characterized by having a back-ege
3800 : /// to a Phi, which contains a reference to another Phi.
3801 597 : static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3802 1127 : for (auto &P : SU->Preds)
3803 530 : if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3804 381 : for (auto &S : P.getSUnit()->Succs)
3805 597 : if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3806 : return P.getSUnit();
3807 356 : return nullptr;
3808 : }
3809 :
3810 : /// Compute the scheduling start slot for the instruction. The start slot
3811 : /// depends on any predecessor or successor nodes scheduled already.
3812 : void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3813 0 : int *MinEnd, int *MaxStart, int II,
3814 0 : SwingSchedulerDAG *DAG) {
3815 0 : // Iterate over each instruction that has been scheduled already. The start
3816 0 : // slot computation depends on whether the previously scheduled instruction
3817 0 : // is a predecessor or successor of the specified instruction.
3818 : for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3819 :
3820 : // Iterate over each instruction in the current cycle.
3821 : for (SUnit *I : getInstructions(cycle)) {
3822 : // Because we're processing a DAG for the dependences, we recognize
3823 : // the back-edge in recurrences by anti dependences.
3824 3030 : for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3825 : const SDep &Dep = SU->Preds[i];
3826 : if (Dep.getSUnit() == I) {
3827 : if (!DAG->isBackedge(SU, Dep)) {
3828 : int EarlyStart = cycle + Dep.getLatency() -
3829 : DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3830 26824 : *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3831 : if (DAG->isLoopCarriedDep(SU, Dep, false)) {
3832 : int End = earliestCycleInChain(Dep) + (II - 1);
3833 73425 : *MinEnd = std::min(*MinEnd, End);
3834 : }
3835 : } else {
3836 63805 : int LateStart = cycle - Dep.getLatency() +
3837 37968 : DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3838 37968 : *MinLateStart = std::min(*MinLateStart, LateStart);
3839 1109 : }
3840 1063 : }
3841 1063 : // For instruction that requires multiple iterations, make sure that
3842 1063 : // the dependent instruction is not scheduled past the definition.
3843 1063 : SUnit *BE = multipleIterations(I, DAG);
3844 16 : if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3845 16 : !SU->isPred(I))
3846 : *MinLateStart = std::min(*MinLateStart, cycle);
3847 : }
3848 46 : for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
3849 46 : if (SU->Succs[i].getSUnit() == I) {
3850 46 : const SDep &Dep = SU->Succs[i];
3851 : if (!DAG->isBackedge(SU, Dep)) {
3852 : int LateStart = cycle - Dep.getLatency() +
3853 : DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3854 : *MinLateStart = std::min(*MinLateStart, LateStart);
3855 37968 : if (DAG->isLoopCarriedDep(SU, Dep)) {
3856 37977 : int Start = latestCycleInChain(Dep) + 1 - II;
3857 : *MaxStart = std::max(*MaxStart, Start);
3858 9 : }
3859 : } else {
3860 65665 : int EarlyStart = cycle + Dep.getLatency() -
3861 79656 : DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3862 : *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3863 3007 : }
3864 2641 : }
3865 2641 : }
3866 2641 : }
3867 2641 : }
3868 356 : }
3869 356 :
3870 : /// Order the instructions within a cycle so that the definitions occur
3871 : /// before the uses. Returns true if the instruction is added to the start
3872 366 : /// of the list, or false if added to the end.
3873 366 : void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3874 366 : std::deque<SUnit *> &Insts) {
3875 : MachineInstr *MI = SU->getInstr();
3876 : bool OrderBeforeUse = false;
3877 : bool OrderAfterDef = false;
3878 : bool OrderBeforeDef = false;
3879 : unsigned MoveDef = 0;
3880 3030 : unsigned MoveUse = 0;
3881 : int StageInst1 = stageScheduled(SU);
3882 :
3883 : unsigned Pos = 0;
3884 : for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3885 1398 : ++I, ++Pos) {
3886 : for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3887 1398 : MachineOperand &MO = MI->getOperand(i);
3888 : if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3889 : continue;
3890 :
3891 : unsigned Reg = MO.getReg();
3892 : unsigned BasePos, OffsetPos;
3893 1398 : if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3894 : if (MI->getOperand(BasePos).getReg() == Reg)
3895 : if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3896 2825 : Reg = NewReg;
3897 : bool Reads, Writes;
3898 6127 : std::tie(Reads, Writes) =
3899 4700 : (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3900 4700 : if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3901 1045 : OrderBeforeUse = true;
3902 : if (MoveUse == 0)
3903 : MoveUse = Pos;
3904 : } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3905 3655 : // Add the instruction after the scheduled instruction.
3906 1818 : OrderAfterDef = true;
3907 487 : MoveDef = Pos;
3908 : } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3909 : if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3910 : OrderBeforeUse = true;
3911 3655 : if (MoveUse == 0)
3912 3877 : MoveUse = Pos;
3913 : } else {
3914 97 : OrderAfterDef = true;
3915 : MoveDef = Pos;
3916 3683 : }
3917 : } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3918 : OrderBeforeUse = true;
3919 : if (MoveUse == 0)
3920 3552 : MoveUse = Pos;
3921 84 : if (MoveUse != 0) {
3922 : OrderAfterDef = true;
3923 0 : MoveDef = Pos - 1;
3924 : }
3925 : } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3926 : // Add the instruction before the scheduled instruction.
3927 : OrderBeforeUse = true;
3928 : if (MoveUse == 0)
3929 3468 : MoveUse = Pos;
3930 : } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3931 28 : isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3932 : if (MoveUse == 0) {
3933 28 : OrderBeforeDef = true;
3934 : MoveUse = Pos;
3935 12 : }
3936 : }
3937 3412 : }
3938 : // Check for order dependences between instructions. Make sure the source
3939 : // is ordered before the destination.
3940 49 : for (auto &S : SU->Succs) {
3941 : if (S.getSUnit() != *I)
3942 6635 : continue;
3943 1196 : if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3944 98 : OrderBeforeUse = true;
3945 : if (Pos < MoveUse)
3946 : MoveUse = Pos;
3947 : }
3948 : }
3949 : for (auto &P : SU->Preds) {
3950 : if (P.getSUnit() != *I)
3951 : continue;
3952 2993 : if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3953 1566 : OrderAfterDef = true;
3954 : MoveDef = Pos;
3955 266 : }
3956 : }
3957 6 : }
3958 :
3959 : // A circular dependence.
3960 : if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3961 3467 : OrderBeforeUse = false;
3962 2040 :
3963 : // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3964 99 : // to a loop-carried dependence.
3965 : if (OrderBeforeDef)
3966 : OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3967 :
3968 : // The uncommon case when the instruction order needs to be updated because
3969 : // there is both a use and def.
3970 : if (OrderBeforeUse && OrderAfterDef) {
3971 : SUnit *UseSU = Insts.at(MoveUse);
3972 1398 : SUnit *DefSU = Insts.at(MoveDef);
3973 : if (MoveUse > MoveDef) {
3974 : Insts.erase(Insts.begin() + MoveUse);
3975 : Insts.erase(Insts.begin() + MoveDef);
3976 : } else {
3977 1398 : Insts.erase(Insts.begin() + MoveDef);
3978 97 : Insts.erase(Insts.begin() + MoveUse);
3979 : }
3980 : orderDependence(SSD, UseSU, Insts);
3981 : orderDependence(SSD, SU, Insts);
3982 1398 : orderDependence(SSD, DefSU, Insts);
3983 47 : return;
3984 47 : }
3985 47 : // Put the new instruction first if there is a use in the list. Otherwise,
3986 86 : // put it at the end of the list.
3987 129 : if (OrderBeforeUse)
3988 : Insts.push_front(SU);
3989 8 : else
3990 12 : Insts.push_back(SU);
3991 : }
3992 47 :
3993 47 : /// Return true if the scheduled Phi has a loop carried operand.
3994 47 : bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3995 47 : if (!Phi.isPHI())
3996 : return false;
3997 : assert(Phi.isPHI() && "Expecting a Phi.");
3998 : SUnit *DefSU = SSD->getSUnit(&Phi);
3999 1351 : unsigned DefCycle = cycleScheduled(DefSU);
4000 : int DefStage = stageScheduled(DefSU);
4001 :
4002 : unsigned InitVal = 0;
4003 : unsigned LoopVal = 0;
4004 : getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
4005 : SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
4006 2149 : if (!UseSU)
4007 : return true;
4008 : if (UseSU->getInstr()->isPHI())
4009 : return true;
4010 : unsigned LoopCycle = cycleScheduled(UseSU);
4011 1517 : int LoopStage = stageScheduled(UseSU);
4012 1517 : return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
4013 : }
4014 :
4015 : /// Return true if the instruction is a definition that is loop carried
4016 1517 : /// and defines the use on the next iteration.
4017 1517 : /// v1 = phi(v2, v3)
4018 1515 : /// (Def) v3 = op v1
4019 2 : /// (MO) = v1
4020 1515 : /// If MO appears before Def, then then v1 and v3 may get assigned to the same
4021 : /// register.
4022 1469 : bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
4023 1469 : MachineInstr *Def, MachineOperand &MO) {
4024 1469 : if (!MO.isReg())
4025 : return false;
4026 : if (Def->isPHI())
4027 : return false;
4028 : MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
4029 : if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
4030 : return false;
4031 : if (!isLoopCarried(SSD, *Phi))
4032 : return false;
4033 : unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
4034 1196 : for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
4035 : MachineOperand &DMO = Def->getOperand(i);
4036 1196 : if (!DMO.isReg() || !DMO.isDef())
4037 : continue;
4038 : if (DMO.getReg() == LoopReg)
4039 : return true;
4040 1196 : }
4041 1561 : return false;
4042 : }
4043 348 :
4044 : // Check if the generated schedule is valid. This function checks if
4045 328 : // an instruction that uses a physical register is scheduled in a
4046 1093 : // different stage than the definition. The pipeliner does not handle
4047 863 : // physical register values that may cross a basic block boundary.
4048 863 : bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
4049 : for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
4050 327 : SUnit &SU = SSD->SUnits[i];
4051 : if (!SU.hasPhysRegDefs)
4052 : continue;
4053 : int StageDef = stageScheduled(&SU);
4054 : assert(StageDef != -1 && "Instruction should have been scheduled.");
4055 : for (auto &SI : SU.Succs)
4056 : if (SI.isAssignedRegDep())
4057 : if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
4058 : if (stageScheduled(SI.getSUnit()) != StageDef)
4059 : return false;
4060 175 : }
4061 1983 : return true;
4062 1633 : }
4063 1633 :
4064 : /// A property of the node order in swing-modulo-scheduling is
4065 0 : /// that for nodes outside circuits the following holds:
4066 : /// none of them is scheduled after both a successor and a
4067 0 : /// predecessor.
4068 : /// The method below checks whether the property is met.
4069 0 : /// If not, debug information is printed and statistics information updated.
4070 0 : /// Note that we do not use an assert statement.
4071 : /// The reason is that although an invalid node oder may prevent
4072 : /// the pipeliner from finding a pipelined schedule for arbitrary II,
4073 : /// it does not lead to the generation of incorrect code.
4074 : void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
4075 :
4076 : // a sorted vector that maps each SUnit to its index in the NodeOrder
4077 : typedef std::pair<SUnit *, unsigned> UnitIndex;
4078 : std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
4079 :
4080 : for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
4081 : Indices.push_back(std::make_pair(NodeOrder[i], i));
4082 :
4083 : auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
4084 : return std::get<0>(i1) < std::get<0>(i2);
4085 : };
4086 188 :
4087 : // sort, so that we can perform a binary search
4088 : llvm::sort(Indices, CompareKey);
4089 :
4090 188 : bool Valid = true;
4091 : (void)Valid;
4092 2033 : // for each SUnit in the NodeOrder, check whether
4093 3690 : // it appears after both a successor and a predecessor
4094 : // of the SUnit. If this is the case, and the SUnit
4095 : // is not part of circuit, then the NodeOrder is not
4096 0 : // valid.
4097 : for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
4098 : SUnit *SU = NodeOrder[i];
4099 : unsigned Index = i;
4100 :
4101 : bool PredBefore = false;
4102 : bool SuccBefore = false;
4103 :
4104 : SUnit *Succ;
4105 : SUnit *Pred;
4106 : (void)Succ;
4107 : (void)Pred;
4108 :
4109 2033 : for (SDep &PredEdge : SU->Preds) {
4110 3690 : SUnit *PredSU = PredEdge.getSUnit();
4111 : unsigned PredIndex =
4112 : std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
4113 : std::make_pair(PredSU, 0), CompareKey));
4114 : if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
4115 : PredBefore = true;
4116 : Pred = PredSU;
4117 : break;
4118 : }
4119 : }
4120 :
4121 3761 : for (SDep &SuccEdge : SU->Succs) {
4122 : SUnit *SuccSU = SuccEdge.getSUnit();
4123 : unsigned SuccIndex =
4124 : std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
4125 2256 : std::make_pair(SuccSU, 0), CompareKey));
4126 3572 : if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
4127 : SuccBefore = true;
4128 : Succ = SuccSU;
4129 : break;
4130 : }
4131 : }
4132 :
4133 2420 : if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
4134 : // instructions in circuits are allowed to be scheduled
4135 : // after both a successor and predecessor.
4136 : bool InCircuit = std::any_of(
4137 1703 : Circuits.begin(), Circuits.end(),
4138 3396 : [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
4139 : if (InCircuit)
4140 : LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
4141 : else {
4142 : Valid = false;
4143 : NumNodeOrderIssues++;
4144 : LLVM_DEBUG(dbgs() << "Predecessor ";);
4145 1845 : }
4146 : LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
4147 : << " are scheduled before node " << SU->NodeNum
4148 : << "\n";);
4149 : }
4150 : }
4151 :
4152 : LLVM_DEBUG({
4153 : if (!Valid)
4154 : dbgs() << "Invalid node order found!\n";
4155 : });
4156 : }
4157 :
4158 : /// Attempt to fix the degenerate cases when the instruction serialization
4159 : /// causes the register lifetimes to overlap. For example,
4160 : /// p' = store_pi(p, b)
4161 : /// = load p, offset
4162 : /// In this case p and p' overlap, which means that two registers are needed.
4163 : /// Instead, this function changes the load to use p' and updates the offset.
4164 : void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
4165 : unsigned OverlapReg = 0;
4166 : unsigned NewBaseReg = 0;
4167 : for (SUnit *SU : Instrs) {
4168 188 : MachineInstr *MI = SU->getInstr();
4169 : for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
4170 : const MachineOperand &MO = MI->getOperand(i);
4171 : // Look for an instruction that uses p. The instruction occurs in the
4172 : // same cycle but occurs later in the serialized order.
4173 : if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
4174 : // Check that the instruction appears in the InstrChanges structure,
4175 : // which contains instructions that can have the offset updated.
4176 500 : DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
4177 : InstrChanges.find(SU);
4178 : if (It != InstrChanges.end()) {
4179 2133 : unsigned BasePos, OffsetPos;
4180 1633 : // Update the base register and adjust the offset.
4181 6817 : if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
4182 5434 : MachineInstr *NewMI = MF.CloneMachineInstr(MI);
4183 : NewMI->getOperand(BasePos).setReg(NewBaseReg);
4184 : int64_t NewOffset =
4185 5434 : MI->getOperand(OffsetPos).getImm() - It->second.second;
4186 : NewMI->getOperand(OffsetPos).setImm(NewOffset);
4187 : SU->setInstr(NewMI);
4188 : MISUnitMap[NewMI] = SU;
4189 2 : NewMIs.insert(NewMI);
4190 2 : }
4191 : }
4192 : OverlapReg = 0;
4193 2 : NewBaseReg = 0;
4194 2 : break;
4195 4 : }
4196 : // Look for an instruction of the form p' = op(p), which uses and defines
4197 2 : // two virtual registers that get allocated to the same physical register.
4198 2 : unsigned TiedUseIdx = 0;
4199 : if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
4200 2 : // OverlapReg is p in the example above.
4201 2 : OverlapReg = MI->getOperand(TiedUseIdx).getReg();
4202 : // NewBaseReg is p' in the example above.
4203 : NewBaseReg = MI->getOperand(i).getReg();
4204 : break;
4205 : }
4206 : }
4207 : }
4208 : }
4209 :
4210 5432 : /// After the schedule has been formed, call this function to combine
4211 5432 : /// the instructions from the different stages/cycles. That is, this
4212 : /// function creates a schedule that represents a single iteration.
4213 496 : void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
4214 : // Move all instructions to the first stage from later stages.
4215 248 : for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
4216 248 : for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
4217 : ++stage) {
4218 : std::deque<SUnit *> &cycleInstrs =
4219 : ScheduledInstrs[cycle + (stage * InitiationInterval)];
4220 500 : for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
4221 : E = cycleInstrs.rend();
4222 : I != E; ++I)
4223 : ScheduledInstrs[cycle].push_front(*I);
4224 : }
4225 175 : }
4226 : // Iterate over the definitions in each instruction, and compute the
4227 675 : // stage difference for each use. Keep the maximum value.
4228 983 : for (auto &I : InstrToCycle) {
4229 : int DefStage = stageScheduled(I.first);
4230 : MachineInstr *MI = I.first->getInstr();
4231 483 : for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
4232 : MachineOperand &Op = MI->getOperand(i);
4233 : if (!Op.isReg() || !Op.isDef())
4234 1107 : continue;
4235 :
4236 : unsigned Reg = Op.getReg();
4237 : unsigned MaxDiff = 0;
4238 : bool PhiIsSwapped = false;
4239 : for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
4240 1808 : EI = MRI.use_end();
4241 1633 : UI != EI; ++UI) {
4242 1633 : MachineOperand &UseOp = *UI;
4243 7733 : MachineInstr *UseMI = UseOp.getParent();
4244 6100 : SUnit *SUnitUse = SSD->getSUnit(UseMI);
4245 6100 : int UseStage = stageScheduled(SUnitUse);
4246 4476 : unsigned Diff = 0;
4247 : if (UseStage != -1 && UseStage >= DefStage)
4248 1624 : Diff = UseStage - DefStage;
4249 1624 : if (MI->isPHI()) {
4250 : if (isLoopCarried(SSD, *MI))
4251 1624 : ++Diff;
4252 : else
4253 3789 : PhiIsSwapped = true;
4254 : }
4255 2165 : MaxDiff = std::max(Diff, MaxDiff);
4256 : }
4257 2165 : RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
4258 2165 : }
4259 2165 : }
4260 2058 :
4261 : // Erase all the elements in the later stages. Only one iteration should
4262 659 : // remain in the scheduled list, and it contains all the instructions.
4263 632 : for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
4264 : ScheduledInstrs.erase(cycle);
4265 :
4266 : // Change the registers in instruction as specified in the InstrChanges
4267 2165 : // map. We need to use the new registers to create the correct order.
4268 : for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
4269 1624 : SUnit *SU = &SSD->SUnits[i];
4270 : SSD->applyInstrChange(SU->getInstr(), *this);
4271 : }
4272 :
4273 : // Reorder the instructions in each cycle to fix and improve the
4274 : // generated code.
4275 591 : for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
4276 416 : std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
4277 : std::deque<SUnit *> newOrderPhi;
4278 : for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
4279 : SUnit *SU = cycleInstrs[i];
4280 1983 : if (SU->getInstr()->isPHI())
4281 1633 : newOrderPhi.push_back(SU);
4282 1633 : }
4283 : std::deque<SUnit *> newOrderI;
4284 : for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
4285 : SUnit *SU = cycleInstrs[i];
4286 : if (!SU->getInstr()->isPHI())
4287 675 : orderDependence(SSD, SU, newOrderI);
4288 500 : }
4289 : // Replace the old order with the new order.
4290 2133 : cycleInstrs.swap(newOrderPhi);
4291 1633 : cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
4292 1633 : SSD->fixupRegisterOverlaps(cycleInstrs);
4293 : }
4294 :
4295 : LLVM_DEBUG(dump(););
4296 2133 : }
4297 1633 :
4298 1633 : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4299 1257 : /// Print the schedule information to the given output.
4300 : void SMSchedule::print(raw_ostream &os) const {
4301 : // Iterate over each cycle.
4302 : for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
4303 500 : // Iterate over each instruction in the cycle.
4304 500 : const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
4305 : for (SUnit *CI : cycleInstrs->second) {
4306 : os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
4307 : os << "(" << CI->NodeNum << ") ";
4308 175 : CI->getInstr()->print(os);
4309 : os << "\n";
4310 : }
4311 : }
4312 : }
4313 :
4314 : /// Utility function used for debugging to print the schedule.
4315 : LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
4316 : #endif
|