LLVM 23.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
75#include "llvm/MC/LaneBitmask.h"
76#include "llvm/MC/MCInstrDesc.h"
78#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
83#include <algorithm>
84#include <cassert>
85#include <climits>
86#include <cstdint>
87#include <deque>
88#include <functional>
89#include <iomanip>
90#include <iterator>
91#include <map>
92#include <memory>
93#include <sstream>
94#include <tuple>
95#include <utility>
96#include <vector>
97
98using namespace llvm;
99
100#define DEBUG_TYPE "pipeliner"
101
102STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
103STATISTIC(NumPipelined, "Number of loops software pipelined");
104STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
105STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
106STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
107STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
108STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
109STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
110STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
111STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
112STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197/// A command line argument to limit the number of store instructions in the
198/// target basic block.
200 "pipeliner-max-num-stores",
201 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
202 cl::init(200));
203
204// A command line option to enable the CopyToPhi DAG mutation.
206 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
207 cl::init(true),
208 cl::desc("Enable CopyToPhi DAG Mutation"));
209
210/// A command line argument to force pipeliner to use specified issue
211/// width.
213 "pipeliner-force-issue-width",
214 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
215 cl::init(-1));
216
217/// A command line argument to set the window scheduling option.
220 cl::desc("Set how to use window scheduling algorithm."),
222 "Turn off window algorithm."),
224 "Use window algorithm after SMS algorithm fails."),
226 "Use window algorithm instead of SMS algorithm.")));
227
228unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
229char MachinePipeliner::ID = 0;
230#ifndef NDEBUG
232#endif
234
236 "Modulo Software Pipelining", false, false)
242 "Modulo Software Pipelining", false, false)
243
244namespace {
245
246/// This class holds an SUnit corresponding to a memory operation and other
247/// information related to the instruction.
251
252 /// The value of a memory operand.
253 const Value *MemOpValue = nullptr;
254
255 /// The offset of a memory operand.
256 int64_t MemOpOffset = 0;
257
259
260 /// True if all the underlying objects are identified.
261 bool IsAllIdentified = false;
262
264
265 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
266
267 bool isUnknown() const { return MemOpValue == nullptr; }
268
269private:
271};
272
273/// Add loop-carried chain dependencies. This class handles the same type of
274/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
275/// account dependencies across iterations.
277 // Type of instruction that is relevant to order-dependencies
278 enum class InstrTag {
279 Barrier = 0, ///< A barrier event instruction.
280 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
281 ///< not a barrier event.
282 FPExceptions = 2, ///< An instruction that does not match above, but may
283 ///< raise floatin-point exceptions.
284 };
285
286 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
287 TaggedSUnit(SUnit *SU, InstrTag Tag)
288 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
289
290 InstrTag getTag() const { return InstrTag(getInt()); }
291 };
292
293 /// Holds loads and stores with memory related information.
294 struct LoadStoreChunk {
297
298 void append(SUnit *SU);
299 };
300
302 BatchAAResults *BAA;
303 std::vector<SUnit> &SUnits;
304
305 /// The size of SUnits, for convenience.
306 const unsigned N;
307
308 /// Loop-carried Edges.
309 std::vector<BitVector> LoopCarried;
310
311 /// Instructions related to chain dependencies. They are one of the
312 /// following:
313 ///
314 /// 1. Barrier event.
315 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
316 /// value.
317 /// 3. Store, but not a barrier event.
318 /// 4. None of them, but may raise floating-point exceptions.
319 ///
320 /// This is used when analyzing loop-carried dependencies that access global
321 /// barrier instructions.
322 std::vector<TaggedSUnit> TaggedSUnits;
323
324 const TargetInstrInfo *TII = nullptr;
325 const TargetRegisterInfo *TRI = nullptr;
326
327public:
329 const TargetInstrInfo *TII,
330 const TargetRegisterInfo *TRI);
331
332 /// The main function to compute loop-carried order-dependencies.
333 void computeDependencies();
334
335 const BitVector &getLoopCarried(unsigned Idx) const {
336 return LoopCarried[Idx];
337 }
338
339private:
340 /// Tags to \p SU if the instruction may affect the order-dependencies.
341 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
342
343 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
344 const LoadStoreChunk &To);
345
346 /// Add a loop-carried order dependency between \p Src and \p Dst if we
347 /// cannot prove they are independent.
348 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
349 const SUnitWithMemInfo &Dst);
350
351 void computeDependenciesAux();
352};
353
354} // end anonymous namespace
355
356/// The "main" function for implementing Swing Modulo Scheduling.
358 if (skipFunction(mf.getFunction()))
359 return false;
360
361 if (!EnableSWP)
362 return false;
363
364 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
365 !EnableSWPOptSize.getPosition())
366 return false;
367
369 return false;
370
371 // Cannot pipeline loops without instruction itineraries if we are using
372 // DFA for the pipeliner.
373 if (mf.getSubtarget().useDFAforSMS() &&
376 return false;
377
378 MF = &mf;
382 TII = MF->getSubtarget().getInstrInfo();
383 RegClassInfo.runOnMachineFunction(*MF);
384
385 for (const auto &L : *MLI)
386 scheduleLoop(*L);
387
388 return false;
389}
390
391/// Attempt to perform the SMS algorithm on the specified loop. This function is
392/// the main entry point for the algorithm. The function identifies candidate
393/// loops, calculates the minimum initiation interval, and attempts to schedule
394/// the loop.
395bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
396 bool Changed = false;
397 for (const auto &InnerLoop : L)
398 Changed |= scheduleLoop(*InnerLoop);
399
400#ifndef NDEBUG
401 // Stop trying after reaching the limit (if any).
402 int Limit = SwpLoopLimit;
403 if (Limit >= 0) {
404 if (NumTries >= SwpLoopLimit)
405 return Changed;
406 NumTries++;
407 }
408#endif
409
410 setPragmaPipelineOptions(L);
411 if (!canPipelineLoop(L)) {
412 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
413 ORE->emit([&]() {
414 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
415 L.getStartLoc(), L.getHeader())
416 << "Failed to pipeline loop";
417 });
418
419 LI.LoopPipelinerInfo.reset();
420 return Changed;
421 }
422
423 ++NumTrytoPipeline;
424 if (useSwingModuloScheduler())
425 Changed = swingModuloScheduler(L);
426
427 if (useWindowScheduler(Changed))
428 Changed = runWindowScheduler(L);
429
430 LI.LoopPipelinerInfo.reset();
431 return Changed;
432}
433
/// Read the loop-pipelining pragmas attached to the loop's IR header block
/// and record them in the pass state (disabledByPragma, II_setByPragma).
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  // The pragma metadata lives on the IR terminator of the corresponding
  // BasicBlock; bail out if any link in that chain is missing.
  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  // Operand 0 is the self-referencing loop id; scan the remaining operands
  // for the pipelining hints.
  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two operands.");
      // NOTE(review): the assignment target of this expression (presumably
      // `II_setByPragma =`) appears to have been lost in extraction — the
      // following assert reads II_setByPragma, so verify against upstream.
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
481
482/// Depth-first search to detect cycles among PHI dependencies.
483/// Returns true if a cycle is detected within the PHI-only subgraph.
484static bool hasPHICycleDFS(
485 unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
486 SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {
487
488 // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
489 auto It = PhiDeps.find(Reg);
490 if (It == PhiDeps.end())
491 return false;
492
493 if (RecStack.count(Reg))
494 return true; // backedge.
495 if (Visited.count(Reg))
496 return false;
497
498 Visited.insert(Reg);
499 RecStack.insert(Reg);
500
501 for (unsigned Dep : It->second) {
502 if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
503 return true;
504 }
505
506 RecStack.erase(Reg);
507 return false;
508}
509
/// Return true if the PHI nodes of \p LoopHeader form a dependence cycle
/// among themselves (a PHI whose operands lead back to itself through other
/// PHI definitions). Non-PHI definitions terminate the search.
/// NOTE(review): the \p MRI parameter is not referenced in the visible body.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
                        const MachineRegisterInfo &MRI) {
  // NOTE(review): the declaration of `PhiDeps` appears to have been lost in
  // extraction — presumably `DenseMap<unsigned, SmallVector<unsigned, 2>>`,
  // matching the signature of hasPHICycleDFS; verify against upstream.

  // Collect PHI nodes and their dependencies.
  for (const MachineInstr &MI : LoopHeader->phis()) {
    unsigned DefReg = MI.getOperand(0).getReg();
    auto Ins = PhiDeps.try_emplace(DefReg).first;

    // PHI operands are (Reg, MBB) pairs starting at index 1.
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
      Ins->second.push_back(MI.getOperand(I).getReg());
  }

  // DFS to detect cycles among PHI nodes.
  SmallSet<unsigned, 8> Visited, RecStack;

  // Start DFS from each PHI-def.
  for (const auto &KV : PhiDeps) {
    unsigned Reg = KV.first;
    if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
      return true;
  }

  return false;
}
536
537/// Return true if the loop can be software pipelined. The algorithm is
538/// restricted to loops with a single basic block. Make sure that the
539/// branch in the loop can be analyzed.
540bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
541 if (L.getNumBlocks() != 1) {
542 ORE->emit([&]() {
543 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
544 L.getStartLoc(), L.getHeader())
545 << "Not a single basic block: "
546 << ore::NV("NumBlocks", L.getNumBlocks());
547 });
548 return false;
549 }
550
551 if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
552 LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
553 return false;
554 }
555
556 if (disabledByPragma) {
557 ORE->emit([&]() {
558 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
559 L.getStartLoc(), L.getHeader())
560 << "Disabled by Pragma.";
561 });
562 return false;
563 }
564
565 // Check if the branch can't be understood because we can't do pipelining
566 // if that's the case.
567 LI.TBB = nullptr;
568 LI.FBB = nullptr;
569 LI.BrCond.clear();
570 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
571 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
572 NumFailBranch++;
573 ORE->emit([&]() {
574 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
575 L.getStartLoc(), L.getHeader())
576 << "The branch can't be understood";
577 });
578 return false;
579 }
580
581 LI.LoopInductionVar = nullptr;
582 LI.LoopCompare = nullptr;
583 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
584 if (!LI.LoopPipelinerInfo) {
585 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
586 NumFailLoop++;
587 ORE->emit([&]() {
588 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
589 L.getStartLoc(), L.getHeader())
590 << "The loop structure is not supported";
591 });
592 return false;
593 }
594
595 if (!L.getLoopPreheader()) {
596 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
597 NumFailPreheader++;
598 ORE->emit([&]() {
599 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
600 L.getStartLoc(), L.getHeader())
601 << "No loop preheader found";
602 });
603 return false;
604 }
605
606 unsigned NumStores = 0;
607 for (MachineInstr &MI : *L.getHeader())
608 if (MI.mayStore())
609 ++NumStores;
610 if (NumStores > SwpMaxNumStores) {
611 LLVM_DEBUG(dbgs() << "Too many stores\n");
612 NumFailTooManyStores++;
613 ORE->emit([&]() {
614 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
615 L.getStartLoc(), L.getHeader())
616 << "Too many store instructions in the loop: "
617 << ore::NV("NumStores", NumStores) << " > "
618 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
619 });
620 return false;
621 }
622
623 // Remove any subregisters from inputs to phi nodes.
624 preprocessPhiNodes(*L.getHeader());
625 return true;
626}
627
/// Rewrite PHI inputs that carry a subregister index: each such input is
/// replaced by a fresh full register defined by a COPY inserted in the
/// corresponding predecessor block, keeping SlotIndexes up to date.
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    // PHI results never carry a subregister index in SSA form.
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    // PHI operands are (Reg, MBB) pairs starting at index 1.
    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
      // NOTE(review): the declaration of the insertion point `At` appears to
      // have been lost in extraction — presumably
      // `MachineBasicBlock::iterator At = PredB.getFirstTerminator();`;
      // verify against upstream.
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                    .addReg(RegOp.getReg(), getRegState(RegOp),
                            RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}
658
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to Schedule the loop.
/// Returns true if a new (pipelined) schedule was produced.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  // NOTE(review): the first argument line of this constructor call appears
  // to have been lost in extraction; verify the full argument list against
  // upstream.
  SwingSchedulerDAG SMS(
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  // NOTE(review): the `for` header of this terminator-counting loop appears
  // to have been lost in extraction; the visible part decrements `size`
  // once per terminator instruction. Verify against upstream.
       E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  // Schedule only the region up to the first terminator.
  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}
691
702
703bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
704 MachineSchedContext Context;
705 Context.MF = MF;
706 Context.MLI = MLI;
707 Context.MDT = MDT;
708 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
709 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
710 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
711 Context.RegClassInfo->runOnMachineFunction(*MF);
712 WindowScheduler WS(&Context, L);
713 return WS.run();
714}
715
/// Whether the SMS algorithm should be attempted for the current loop.
bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  // NOTE(review): the return statement (presumably comparing the window
  // scheduling option against its "force" mode) appears to have been lost
  // in extraction; verify against upstream.
}
720
/// Whether the window scheduler should run, given that SMS already
/// \p Changed (or not) the loop.
bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work for following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler is successfully scheduled.
  // 3. when pragma II is enabled.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  // NOTE(review): the final return statement (presumably combining the
  // window-scheduling option with !Changed) appears to have been lost in
  // extraction; verify against upstream.
}
735
736void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
737 if (SwpForceII > 0)
738 MII = SwpForceII;
739 else if (II_setByPragma > 0)
740 MII = II_setByPragma;
741 else
742 MII = std::max(ResMII, RecMII);
743}
744
745void SwingSchedulerDAG::setMAX_II() {
746 if (SwpForceII > 0)
747 MAX_II = SwpForceII;
748 else if (II_setByPragma > 0)
749 MAX_II = II_setByPragma;
750 else
751 MAX_II = MII + SwpIISearchRange;
752}
753
754/// We override the schedule function in ScheduleDAGInstrs to implement the
755/// scheduling part of the Swing Modulo Scheduling algorithm.
757 buildSchedGraph(AA);
758 const LoopCarriedEdges LCE = addLoopCarriedDependences();
759 updatePhiDependences();
760 Topo.InitDAGTopologicalSorting();
761 changeDependences();
762 postProcessDAG();
763 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
764 LLVM_DEBUG({
765 dump();
766 dbgs() << "===== Loop Carried Edges Begin =====\n";
767 for (SUnit &SU : SUnits)
768 LCE.dump(&SU, TRI, &MRI);
769 dbgs() << "===== Loop Carried Edges End =====\n";
770 });
771
772 NodeSetType NodeSets;
773 findCircuits(NodeSets);
774 NodeSetType Circuits = NodeSets;
775
776 // Calculate the MII.
777 unsigned ResMII = calculateResMII();
778 unsigned RecMII = calculateRecMII(NodeSets);
779
780 fuseRecs(NodeSets);
781
782 // This flag is used for testing and can cause correctness problems.
783 if (SwpIgnoreRecMII)
784 RecMII = 0;
785
786 setMII(ResMII, RecMII);
787 setMAX_II();
788
789 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
790 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
791
792 // Can't schedule a loop without a valid MII.
793 if (MII == 0) {
794 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
795 NumFailZeroMII++;
796 Pass.ORE->emit([&]() {
798 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
799 << "Invalid Minimal Initiation Interval: 0";
800 });
801 return;
802 }
803
804 // Don't pipeline large loops.
805 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
806 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
807 << ", we don't pipeline large loops\n");
808 NumFailLargeMaxMII++;
809 Pass.ORE->emit([&]() {
811 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
812 << "Minimal Initiation Interval too large: "
813 << ore::NV("MII", (int)MII) << " > "
814 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
815 << "Refer to -pipeliner-max-mii.";
816 });
817 return;
818 }
819
820 computeNodeFunctions(NodeSets);
821
822 registerPressureFilter(NodeSets);
823
824 colocateNodeSets(NodeSets);
825
826 checkNodeSets(NodeSets);
827
828 LLVM_DEBUG({
829 for (auto &I : NodeSets) {
830 dbgs() << " Rec NodeSet ";
831 I.dump();
832 }
833 });
834
835 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
836
837 groupRemainingNodes(NodeSets);
838
839 removeDuplicateNodes(NodeSets);
840
841 LLVM_DEBUG({
842 for (auto &I : NodeSets) {
843 dbgs() << " NodeSet ";
844 I.dump();
845 }
846 });
847
848 computeNodeOrder(NodeSets);
849
850 // check for node order issues
851 checkValidNodeOrder(Circuits);
852
853 SMSchedule Schedule(Pass.MF, this);
854 Scheduled = schedulePipeline(Schedule);
855
856 if (!Scheduled){
857 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
858 NumFailNoSchedule++;
859 Pass.ORE->emit([&]() {
861 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
862 << "Unable to find schedule";
863 });
864 return;
865 }
866
867 unsigned numStages = Schedule.getMaxStageCount();
868 // No need to generate pipeline if there are no overlapped iterations.
869 if (numStages == 0) {
870 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
871 NumFailZeroStage++;
872 Pass.ORE->emit([&]() {
874 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
875 << "No need to pipeline - no overlapped iterations in schedule.";
876 });
877 return;
878 }
879 // Check that the maximum stage count is less than user-defined limit.
880 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
881 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
882 << " : too many stages, abort\n");
883 NumFailLargeMaxStage++;
884 Pass.ORE->emit([&]() {
886 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
887 << "Too many stages in schedule: "
888 << ore::NV("numStages", (int)numStages) << " > "
889 << ore::NV("SwpMaxStages", SwpMaxStages)
890 << ". Refer to -pipeliner-max-stages.";
891 });
892 return;
893 }
894
895 Pass.ORE->emit([&]() {
896 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
897 Loop.getHeader())
898 << "Pipelined succesfully!";
899 });
900
901 // Generate the schedule as a ModuloSchedule.
902 DenseMap<MachineInstr *, int> Cycles, Stages;
903 std::vector<MachineInstr *> OrderedInsts;
904 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
905 ++Cycle) {
906 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
907 OrderedInsts.push_back(SU->getInstr());
908 Cycles[SU->getInstr()] = Cycle;
909 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
910 }
911 }
913 for (auto &KV : NewMIs) {
914 Cycles[KV.first] = Cycles[KV.second];
915 Stages[KV.first] = Stages[KV.second];
916 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
917 }
918
919 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
920 std::move(Stages));
922 assert(NewInstrChanges.empty() &&
923 "Cannot serialize a schedule with InstrChanges!");
925 MSTI.annotate();
926 return;
927 }
928 // The experimental code generator can't work if there are InstChanges.
929 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
930 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
931 MSE.expand();
932 } else if (MVECodeGen && NewInstrChanges.empty() &&
933 LoopPipelinerInfo->isMVEExpanderSupported() &&
935 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
936 MSE.expand();
937 } else {
938 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
939 MSE.expand();
940 MSE.cleanup();
941 }
942 ++NumPipelined;
943}
944
945/// Clean up after the software pipeliner runs.
947 for (auto &KV : NewMIs)
948 MF.deleteMachineInstr(KV.second);
949 NewMIs.clear();
950
951 // Call the superclass.
953}
954
955/// Return the register values for the operands of a Phi instruction.
956/// This function assume the instruction is a Phi.
958 Register &InitVal, Register &LoopVal) {
959 assert(Phi.isPHI() && "Expecting a Phi.");
960
961 InitVal = Register();
962 LoopVal = Register();
963 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
964 if (Phi.getOperand(i + 1).getMBB() != Loop)
965 InitVal = Phi.getOperand(i).getReg();
966 else
967 LoopVal = Phi.getOperand(i).getReg();
968
969 assert(InitVal && LoopVal && "Unexpected Phi structure.");
970}
971
972/// Return the Phi register value that comes the loop block.
974 const MachineBasicBlock *LoopBB) {
975 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
976 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
977 return Phi.getOperand(i).getReg();
978 return Register();
979}
980
/// Return true if SUb can be reached from SUa following the chain edges
/// (i.e. only SDep::Order successors are traversed).
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  // NOTE(review): the declarations of the DFS worklist and visited set
  // appear to have been lost in extraction — presumably a SmallVector
  // `Worklist` and a SmallPtrSet `Visited`; verify against upstream.
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      // Only follow order (chain) edges; other dependence kinds are ignored.
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}
1002
1004 if (!getUnderlyingObjects())
1005 return;
1006 for (const Value *Obj : UnderlyingObjs)
1007 if (!isIdentifiedObject(Obj)) {
1008 IsAllIdentified = false;
1009 break;
1010 }
1011}
1012
1014 const SUnitWithMemInfo &Other) const {
1015 // If all underlying objects are identified objects and there is no overlap
1016 // between them, then these two instructions are disjoint.
1017 if (!IsAllIdentified || !Other.IsAllIdentified)
1018 return false;
1019 for (const Value *Obj : UnderlyingObjs)
1020 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
1021 return false;
1022 return true;
1023}
1024
/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  // Only instructions with exactly one memory operand backed by an IR value
  // are analyzable here.
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  // NOTE(review): the call that populates `UnderlyingObjs` (presumably
  // llvm::getUnderlyingObjects on MemOpValue) appears to have been lost in
  // extraction; verify against upstream.

  // TODO: A no alias scope may be valid only in a single iteration. In this
  // case we need to peel off it like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}
1045
1046/// Returns true if there is a loop-carried order dependency from \p Src to \p
1047/// Dst.
1048static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
1049 const SUnitWithMemInfo &Dst,
1050 BatchAAResults &BAA,
1051 const TargetInstrInfo *TII,
1052 const TargetRegisterInfo *TRI,
1053 const SwingSchedulerDAG *SSD) {
1054 if (Src.isTriviallyDisjoint(Dst))
1055 return false;
1056 if (isSuccOrder(Src.SU, Dst.SU))
1057 return false;
1058
1059 MachineInstr &SrcMI = *Src.SU->getInstr();
1060 MachineInstr &DstMI = *Dst.SU->getInstr();
1061
1062 if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
1063 return false;
1064
1065 // Second, the more expensive check that uses alias analysis on the
1066 // base registers. If they alias, and the load offset is less than
1067 // the store offset, the mark the dependence as loop carried.
1068 if (Src.isUnknown() || Dst.isUnknown())
1069 return true;
1070 if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
1071 return true;
1072
1073 if (BAA.isNoAlias(
1074 MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
1075 MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
1076 return false;
1077
1078 // AliasAnalysis sometimes gives up on following the underlying
1079 // object. In such a case, separate checks for underlying objects may
1080 // prove that there are no aliases between two accesses.
1081 for (const Value *SrcObj : Src.UnderlyingObjs)
1082 for (const Value *DstObj : Dst.UnderlyingObjs)
1083 if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
1084 MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
1085 return true;
1086
1087 return false;
1088}
1089
1090void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1091 const MachineInstr *MI = SU->getInstr();
1092 if (!MI->mayLoadOrStore())
1093 return;
1094 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1095}
1096
1098 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1099 const TargetRegisterInfo *TRI)
1100 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1101 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1102
1104 // Traverse all instructions and extract only what we are targetting.
1105 for (auto &SU : SUnits) {
1106 auto Tagged = getInstrTag(&SU);
1107
1108 // This instruction has no loop-carried order-dependencies.
1109 if (!Tagged)
1110 continue;
1111 TaggedSUnits.emplace_back(&SU, *Tagged);
1112 }
1113
1114 computeDependenciesAux();
1115}
1116
1117std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1118LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1119 MachineInstr *MI = SU->getInstr();
1120 if (TII->isGlobalMemoryObject(MI))
1121 return InstrTag::Barrier;
1122
1123 if (MI->mayStore() ||
1124 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1125 return InstrTag::LoadOrStore;
1126
1127 if (MI->mayRaiseFPException())
1128 return InstrTag::FPExceptions;
1129
1130 return std::nullopt;
1131}
1132
1133void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1134 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst) {
1135 // Avoid self-dependencies.
1136 if (Src.SU == Dst.SU)
1137 return;
1138
1139 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG))
1140 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1141}
1142
1143void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1144 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1145 // Add load-to-store dependencies (WAR).
1146 for (const SUnitWithMemInfo &Src : From.Loads)
1147 for (const SUnitWithMemInfo &Dst : To.Stores)
1148 addDependenciesBetweenSUs(Src, Dst);
1149
1150 // Add store-to-load dependencies (RAW).
1151 for (const SUnitWithMemInfo &Src : From.Stores)
1152 for (const SUnitWithMemInfo &Dst : To.Loads)
1153 addDependenciesBetweenSUs(Src, Dst);
1154
1155 // Add store-to-store dependencies (WAW).
1156 for (const SUnitWithMemInfo &Src : From.Stores)
1157 for (const SUnitWithMemInfo &Dst : To.Stores)
1158 addDependenciesBetweenSUs(Src, Dst);
1159}
1160
1161void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
1163 for (const auto &TSU : TaggedSUnits) {
1164 InstrTag Tag = TSU.getTag();
1165 SUnit *SU = TSU.getPointer();
1166 switch (Tag) {
1167 case InstrTag::Barrier:
1168 Chunks.emplace_back();
1169 break;
1170 case InstrTag::LoadOrStore:
1171 Chunks.back().append(SU);
1172 break;
1173 case InstrTag::FPExceptions:
1174 // TODO: Handle this properly.
1175 break;
1176 }
1177 }
1178
1179 // Add dependencies between memory operations. If there are one or more
1180 // barrier events between two memory instructions, we don't add a
1181 // loop-carried dependence for them.
1182 for (const LoadStoreChunk &Chunk : Chunks)
1183 addLoopCarriedDepenenciesForChunks(Chunk, Chunk);
1184
1185 // TODO: If there are multiple barrier instructions, dependencies from the
1186 // last barrier instruction (or load/store below it) to the first barrier
1187 // instruction (or load/store above it).
1188}
1189
1190/// Add a chain edge between a load and store if the store can be an
1191/// alias of the load on a subsequent iteration, i.e., a loop carried
1192/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1193/// but that code doesn't create loop carried dependences.
1194/// TODO: Also compute output-dependencies.
1195LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1196 LoopCarriedEdges LCE;
1197
1198 // Add loop-carried order-dependencies
1199 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1200 LCODTracker.computeDependencies();
1201 for (unsigned I = 0; I != SUnits.size(); I++)
1202 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1203 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1204
1205 LCE.modifySUnits(SUnits, TII);
1206 return LCE;
1207}
1208
1209/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1210/// processes dependences for PHIs. This function adds true dependences
1211/// from a PHI to a use, and a loop carried dependence from the use to the
1212/// PHI. The loop carried dependence is represented as an anti dependence
1213/// edge. This function also removes chain dependences between unrelated
1214/// PHIs.
1215void SwingSchedulerDAG::updatePhiDependences() {
1216 SmallVector<SDep, 4> RemoveDeps;
1217 const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1218
1219 // Iterate over each DAG node.
1220 for (SUnit &I : SUnits) {
1221 RemoveDeps.clear();
1222 // Set to true if the instruction has an operand defined by a Phi.
1223 Register HasPhiUse;
1224 Register HasPhiDef;
1225 MachineInstr *MI = I.getInstr();
1226 // Iterate over each operand, and we process the definitions.
1227 for (const MachineOperand &MO : MI->operands()) {
1228 if (!MO.isReg())
1229 continue;
1230 Register Reg = MO.getReg();
1231 if (MO.isDef()) {
1232 // If the register is used by a Phi, then create an anti dependence.
1234 UI = MRI.use_instr_begin(Reg),
1235 UE = MRI.use_instr_end();
1236 UI != UE; ++UI) {
1237 MachineInstr *UseMI = &*UI;
1238 SUnit *SU = getSUnit(UseMI);
1239 if (SU != nullptr && UseMI->isPHI()) {
1240 if (!MI->isPHI()) {
1241 SDep Dep(SU, SDep::Anti, Reg);
1242 Dep.setLatency(1);
1243 I.addPred(Dep);
1244 } else {
1245 HasPhiDef = Reg;
1246 // Add a chain edge to a dependent Phi that isn't an existing
1247 // predecessor.
1248
1249 // %3:intregs = PHI %21:intregs, %bb.6, %7:intregs, %bb.1 - SU0
1250 // %7:intregs = PHI %21:intregs, %bb.6, %13:intregs, %bb.1 - SU1
1251 // %27:intregs = A2_zxtb %3:intregs - SU2
1252 // %13:intregs = C2_muxri %45:predregs, 0, %46:intreg
1253 // If we have dependent phis, SU0 should be the successor of SU1
1254 // not the other way around. (it used to be SU1 is the successor
1255 // of SU0). In some cases, SU0 is scheduled earlier than SU1
1256 // resulting in bad IR as we do not have a value that can be used
1257 // by SU2.
1258
1259 if (SU->NodeNum < I.NodeNum && !SU->isPred(&I))
1260 SU->addPred(SDep(&I, SDep::Barrier));
1261 }
1262 }
1263 }
1264 } else if (MO.isUse()) {
1265 // If the register is defined by a Phi, then create a true dependence.
1266 MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1267 if (DefMI == nullptr)
1268 continue;
1269 SUnit *SU = getSUnit(DefMI);
1270 if (SU != nullptr && DefMI->isPHI()) {
1271 if (!MI->isPHI()) {
1272 SDep Dep(SU, SDep::Data, Reg);
1273 Dep.setLatency(0);
1274 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
1275 &SchedModel);
1276 I.addPred(Dep);
1277 } else {
1278 HasPhiUse = Reg;
1279 // Add a chain edge to a dependent Phi that isn't an existing
1280 // predecessor.
1281 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1282 I.addPred(SDep(SU, SDep::Barrier));
1283 }
1284 }
1285 }
1286 }
1287 // Remove order dependences from an unrelated Phi.
1288 if (!SwpPruneDeps)
1289 continue;
1290 for (auto &PI : I.Preds) {
1291 MachineInstr *PMI = PI.getSUnit()->getInstr();
1292 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1293 if (I.getInstr()->isPHI()) {
1294 if (PMI->getOperand(0).getReg() == HasPhiUse)
1295 continue;
1296 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1297 continue;
1298 }
1299 RemoveDeps.push_back(PI);
1300 }
1301 }
1302 for (const SDep &D : RemoveDeps)
1303 I.removePred(D);
1304 }
1305}
1306
1307/// Iterate over each DAG node and see if we can change any dependences
1308/// in order to reduce the recurrence MII.
1309void SwingSchedulerDAG::changeDependences() {
1310 // See if an instruction can use a value from the previous iteration.
1311 // If so, we update the base and offset of the instruction and change
1312 // the dependences.
1313 for (SUnit &I : SUnits) {
1314 unsigned BasePos = 0, OffsetPos = 0;
1315 Register NewBase;
1316 int64_t NewOffset = 0;
1317 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1318 NewOffset))
1319 continue;
1320
1321 // Get the MI and SUnit for the instruction that defines the original base.
1322 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1323 MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1324 if (!DefMI)
1325 continue;
1326 SUnit *DefSU = getSUnit(DefMI);
1327 if (!DefSU)
1328 continue;
1329 // Get the MI and SUnit for the instruction that defins the new base.
1330 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1331 if (!LastMI)
1332 continue;
1333 SUnit *LastSU = getSUnit(LastMI);
1334 if (!LastSU)
1335 continue;
1336
1337 if (Topo.IsReachable(&I, LastSU))
1338 continue;
1339
1340 // Remove the dependence. The value now depends on a prior iteration.
1342 for (const SDep &P : I.Preds)
1343 if (P.getSUnit() == DefSU)
1344 Deps.push_back(P);
1345 for (const SDep &D : Deps) {
1346 Topo.RemovePred(&I, D.getSUnit());
1347 I.removePred(D);
1348 }
1349 // Remove the chain dependence between the instructions.
1350 Deps.clear();
1351 for (auto &P : LastSU->Preds)
1352 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1353 Deps.push_back(P);
1354 for (const SDep &D : Deps) {
1355 Topo.RemovePred(LastSU, D.getSUnit());
1356 LastSU->removePred(D);
1357 }
1358
1359 // Add a dependence between the new instruction and the instruction
1360 // that defines the new base.
1361 SDep Dep(&I, SDep::Anti, NewBase);
1362 Topo.AddPred(LastSU, &I);
1363 LastSU->addPred(Dep);
1364
1365 // Remember the base and offset information so that we can update the
1366 // instruction during code generation.
1367 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1368 }
1369}
1370
1371/// Create an instruction stream that represents a single iteration and stage of
1372/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1373/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1374/// function is an approximation of SMSchedule::finalizeSchedule with all
1375/// non-const operations removed.
1377 SMSchedule &Schedule,
1378 std::vector<MachineInstr *> &OrderedInsts,
1381
1382 // Move all instructions to the first stage from the later stages.
1383 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1384 ++Cycle) {
1385 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1386 Stage <= LastStage; ++Stage) {
1387 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1388 Cycle + Stage * Schedule.getInitiationInterval()))) {
1389 Instrs[Cycle].push_front(SU);
1390 }
1391 }
1392 }
1393
1394 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1395 ++Cycle) {
1396 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1397 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1398 for (SUnit *SU : CycleInstrs) {
1399 MachineInstr *MI = SU->getInstr();
1400 OrderedInsts.push_back(MI);
1401 Stages[MI] = Schedule.stageScheduled(SU);
1402 }
1403 }
1404}
1405
1406namespace {
1407
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // Usage count per critical functional unit / processor resource; filled by
  // calcCriticalResources and consulted as a tie breaker in operator().
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  // On return, F holds the unit mask (itineraries) or the processor resource
  // index (sched model) of the most constrained stage.
  // NOTE(review): when the sched-model path hits an invalid sched class it
  // returns UINT_MAX without writing F — callers then look up Resources with
  // F's previous value (0); confirm this is intended.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX; // sentinel: "no constraint found yet"
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only count stages with exactly one unit choice (critical units).
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  /// Fewer functional-unit alternatives means higher priority; ties are
  /// broken by how heavily the critical unit is used (see Resources).
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1511
1512/// Calculate the maximum register pressure of the scheduled instructions stream
1513class HighRegisterPressureDetector {
1514 MachineBasicBlock *OrigMBB;
1515 const MachineRegisterInfo &MRI;
1516 const TargetRegisterInfo *TRI;
1517
1518 const unsigned PSetNum;
1519
1520 // Indexed by PSet ID
1521 // InitSetPressure takes into account the register pressure of live-in
1522 // registers. It's not depend on how the loop is scheduled, so it's enough to
1523 // calculate them once at the beginning.
1524 std::vector<unsigned> InitSetPressure;
1525
1526 // Indexed by PSet ID
1527 // Upper limit for each register pressure set
1528 std::vector<unsigned> PressureSetLimit;
1529
1530 DenseMap<MachineInstr *, RegisterOperands> ROMap;
1531
1532 using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1533
1534public:
1535 using OrderedInstsTy = std::vector<MachineInstr *>;
1536 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1537
1538private:
1539 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1540 if (Pressures.size() == 0) {
1541 dbgs() << "[]";
1542 } else {
1543 char Prefix = '[';
1544 for (unsigned P : Pressures) {
1545 dbgs() << Prefix << P;
1546 Prefix = ' ';
1547 }
1548 dbgs() << ']';
1549 }
1550 }
1551
1552 void dumpPSet(Register Reg) const {
1553 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1554 // FIXME: The static_cast is a bug compensating bugs in the callers.
1555 VirtRegOrUnit VRegOrUnit =
1556 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1557 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1558 for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
1559 ++PSetIter) {
1560 dbgs() << *PSetIter << ' ';
1561 }
1562 dbgs() << '\n';
1563 }
1564
1565 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1566 Register Reg) const {
1567 // FIXME: The static_cast is a bug compensating bugs in the callers.
1568 VirtRegOrUnit VRegOrUnit =
1569 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1570 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1571 auto PSetIter = MRI.getPressureSets(VRegOrUnit);
1572 unsigned Weight = PSetIter.getWeight();
1573 for (; PSetIter.isValid(); ++PSetIter)
1574 Pressure[*PSetIter] += Weight;
1575 }
1576
1577 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1578 Register Reg) const {
1579 auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
1580 unsigned Weight = PSetIter.getWeight();
1581 for (; PSetIter.isValid(); ++PSetIter) {
1582 auto &P = Pressure[*PSetIter];
1583 assert(P >= Weight &&
1584 "register pressure must be greater than or equal weight");
1585 P -= Weight;
1586 }
1587 }
1588
1589 // Return true if Reg is reserved one, for example, stack pointer
1590 bool isReservedRegister(Register Reg) const {
1591 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1592 }
1593
1594 bool isDefinedInThisLoop(Register Reg) const {
1595 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1596 }
1597
1598 // Search for live-in variables. They are factored into the register pressure
1599 // from the begining. Live-in variables used by every iteration should be
1600 // considered as alive throughout the loop. For example, the variable `c` in
1601 // following code. \code
1602 // int c = ...;
1603 // for (int i = 0; i < n; i++)
1604 // a[i] += b[i] + c;
1605 // \endcode
1606 void computeLiveIn() {
1607 DenseSet<Register> Used;
1608 for (auto &MI : *OrigMBB) {
1609 if (MI.isDebugInstr())
1610 continue;
1611 for (auto &Use : ROMap[&MI].Uses) {
1612 // FIXME: The static_cast is a bug.
1613 Register Reg =
1614 Use.VRegOrUnit.isVirtualReg()
1615 ? Use.VRegOrUnit.asVirtualReg()
1616 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1617 // Ignore the variable that appears only on one side of phi instruction
1618 // because it's used only at the first iteration.
1619 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1620 continue;
1621 if (isReservedRegister(Reg))
1622 continue;
1623 if (isDefinedInThisLoop(Reg))
1624 continue;
1625 Used.insert(Reg);
1626 }
1627 }
1628
1629 for (auto LiveIn : Used)
1630 increaseRegisterPressure(InitSetPressure, LiveIn);
1631 }
1632
1633 // Calculate the upper limit of each pressure set
1634 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1635 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1636 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1637 }
1638
1639 // There are two patterns of last-use.
1640 // - by an instruction of the current iteration
1641 // - by a phi instruction of the next iteration (loop carried value)
1642 //
1643 // Furthermore, following two groups of instructions are executed
1644 // simultaneously
1645 // - next iteration's phi instructions in i-th stage
1646 // - current iteration's instructions in i+1-th stage
1647 //
1648 // This function calculates the last-use of each register while taking into
1649 // account the above two patterns.
1650 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1651 Instr2StageTy &Stages) const {
1652 // We treat virtual registers that are defined and used in this loop.
1653 // Following virtual register will be ignored
1654 // - live-in one
1655 // - defined but not used in the loop (potentially live-out)
1656 DenseSet<Register> TargetRegs;
1657 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1658 if (isDefinedInThisLoop(Reg))
1659 TargetRegs.insert(Reg);
1660 };
1661 for (MachineInstr *MI : OrderedInsts) {
1662 if (MI->isPHI()) {
1663 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1664 UpdateTargetRegs(Reg);
1665 } else {
1666 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1667 // FIXME: The static_cast is a bug.
1668 Register Reg = Use.VRegOrUnit.isVirtualReg()
1669 ? Use.VRegOrUnit.asVirtualReg()
1670 : Register(static_cast<unsigned>(
1671 Use.VRegOrUnit.asMCRegUnit()));
1672 UpdateTargetRegs(Reg);
1673 }
1674 }
1675 }
1676
1677 const auto InstrScore = [&Stages](MachineInstr *MI) {
1678 return Stages[MI] + MI->isPHI();
1679 };
1680
1681 DenseMap<Register, MachineInstr *> LastUseMI;
1682 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1683 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1684 // FIXME: The static_cast is a bug.
1685 Register Reg =
1686 Use.VRegOrUnit.isVirtualReg()
1687 ? Use.VRegOrUnit.asVirtualReg()
1688 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1689 if (!TargetRegs.contains(Reg))
1690 continue;
1691 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1692 if (!Inserted) {
1693 MachineInstr *Orig = Ite->second;
1694 MachineInstr *New = MI;
1695 if (InstrScore(Orig) < InstrScore(New))
1696 Ite->second = New;
1697 }
1698 }
1699 }
1700
1701 Instr2LastUsesTy LastUses;
1702 for (auto [Reg, MI] : LastUseMI)
1703 LastUses[MI].insert(Reg);
1704 return LastUses;
1705 }
1706
1707 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1708 // iterations and check the register pressure at the point where all stages
1709 // overlapping.
1710 //
1711 // An example of unrolled loop where #Stage is 4..
1712 // Iter i+0 i+1 i+2 i+3
1713 // ------------------------
1714 // Stage 0
1715 // Stage 1 0
1716 // Stage 2 1 0
1717 // Stage 3 2 1 0 <- All stages overlap
1718 //
1719 std::vector<unsigned>
1720 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1721 Instr2StageTy &Stages,
1722 const unsigned StageCount) const {
1723 using RegSetTy = SmallDenseSet<Register, 16>;
1724
1725 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1726 // manage the liveness of the registers independently by iterations.
1727 SmallVector<RegSetTy> LiveRegSets(StageCount);
1728
1729 auto CurSetPressure = InitSetPressure;
1730 auto MaxSetPressure = InitSetPressure;
1731 auto LastUses = computeLastUses(OrderedInsts, Stages);
1732
1733 LLVM_DEBUG({
1734 dbgs() << "Ordered instructions:\n";
1735 for (MachineInstr *MI : OrderedInsts) {
1736 dbgs() << "Stage " << Stages[MI] << ": ";
1737 MI->dump();
1738 }
1739 });
1740
1741 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1742 VirtRegOrUnit VRegOrUnit) {
1743 // FIXME: The static_cast is a bug.
1744 Register Reg =
1745 VRegOrUnit.isVirtualReg()
1746 ? VRegOrUnit.asVirtualReg()
1747 : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
1748 if (!Reg.isValid() || isReservedRegister(Reg))
1749 return;
1750
1751 bool Inserted = RegSet.insert(Reg).second;
1752 if (!Inserted)
1753 return;
1754
1755 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1756 increaseRegisterPressure(CurSetPressure, Reg);
1757 LLVM_DEBUG(dumpPSet(Reg));
1758 };
1759
1760 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1761 Register Reg) {
1762 if (!Reg.isValid() || isReservedRegister(Reg))
1763 return;
1764
1765 // live-in register
1766 if (!RegSet.contains(Reg))
1767 return;
1768
1769 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1770 RegSet.erase(Reg);
1771 decreaseRegisterPressure(CurSetPressure, Reg);
1772 LLVM_DEBUG(dumpPSet(Reg));
1773 };
1774
1775 for (unsigned I = 0; I < StageCount; I++) {
1776 for (MachineInstr *MI : OrderedInsts) {
1777 const auto Stage = Stages[MI];
1778 if (I < Stage)
1779 continue;
1780
1781 const unsigned Iter = I - Stage;
1782
1783 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1784 InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
1785
1786 for (auto LastUse : LastUses[MI]) {
1787 if (MI->isPHI()) {
1788 if (Iter != 0)
1789 EraseReg(LiveRegSets[Iter - 1], LastUse);
1790 } else {
1791 EraseReg(LiveRegSets[Iter], LastUse);
1792 }
1793 }
1794
1795 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1796 MaxSetPressure[PSet] =
1797 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1798
1799 LLVM_DEBUG({
1800 dbgs() << "CurSetPressure=";
1801 dumpRegisterPressures(CurSetPressure);
1802 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1803 MI->dump();
1804 });
1805 }
1806 }
1807
1808 return MaxSetPressure;
1809 }
1810
1811public:
1812 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1813 const MachineFunction &MF)
1814 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1815 TRI(MF.getSubtarget().getRegisterInfo()),
1816 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1817 PressureSetLimit(PSetNum, 0) {}
1818
1819 // Used to calculate register pressure, which is independent of loop
1820 // scheduling.
1821 void init(const RegisterClassInfo &RCI) {
1822 for (MachineInstr &MI : *OrigMBB) {
1823 if (MI.isDebugInstr())
1824 continue;
1825 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1826 }
1827
1828 computeLiveIn();
1829 computePressureSetLimit(RCI);
1830 }
1831
1832 // Calculate the maximum register pressures of the loop and check if they
1833 // exceed the limit
1834 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1835 const unsigned MaxStage) const {
1837 "the percentage of the margin must be between 0 to 100");
1838
1839 OrderedInstsTy OrderedInsts;
1840 Instr2StageTy Stages;
1841 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1842 const auto MaxSetPressure =
1843 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1844
1845 LLVM_DEBUG({
1846 dbgs() << "Dump MaxSetPressure:\n";
1847 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1848 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1849 }
1850 dbgs() << '\n';
1851 });
1852
1853 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1854 unsigned Limit = PressureSetLimit[PSet];
1855 unsigned Margin = Limit * RegPressureMargin / 100;
1856 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1857 << " Margin=" << Margin << "\n");
1858 if (Limit < MaxSetPressure[PSet] + Margin) {
1859 LLVM_DEBUG(
1860 dbgs()
1861 << "Rejected the schedule because of too high register pressure\n");
1862 return true;
1863 }
1864 }
1865 return false;
1866 }
1867};
1868
1869} // end anonymous namespace
1870
1871/// Calculate the resource constrained minimum initiation interval for the
1872/// specified loop. We use the DFA to model the resources needed for
1873/// each instruction, and we ignore dependences. A different DFA is created
1874/// for each cycle that is required. When adding a new instruction, we attempt
1875/// to add it to each existing DFA, until a legal space is found. If the
1876/// instruction cannot be reserved in an existing DFA, we create a new one.
1877unsigned SwingSchedulerDAG::calculateResMII() {
1878 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1879 ResourceManager RM(&MF.getSubtarget(), this);
1880 return RM.calculateResMII();
1881}
1882
1883/// Calculate the recurrence-constrainted minimum initiation interval.
1884/// Iterate over each circuit. Compute the delay(c) and distance(c)
1885/// for each circuit. The II needs to satisfy the inequality
1886/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1887/// II that satisfies the inequality, and the RecMII is the maximum
1888/// of those values.
1889unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1890 unsigned RecMII = 0;
1891
1892 for (NodeSet &Nodes : NodeSets) {
1893 if (Nodes.empty())
1894 continue;
1895
1896 unsigned Delay = Nodes.getLatency();
1897 unsigned Distance = 1;
1898
1899 // ii = ceil(delay / distance)
1900 unsigned CurMII = (Delay + Distance - 1) / Distance;
1901 Nodes.setRecMII(CurMII);
1902 if (CurMII > RecMII)
1903 RecMII = CurMII;
1904 }
1905
1906 return RecMII;
1907}
1908
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  // Maps the tail of an output-dependence chain to its head, so that a single
  // back-edge per chain can be added after the main loop.
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          // Extend an existing chain: remap its head to the new tail.
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1972
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  // Johnson-style elementary-circuit search. S is the start node of the
  // current search; only nodes numbered >= S participate, so each circuit is
  // discovered exactly once, from its lowest-numbered node.
  SUnit *SV = &SUnits[V];
  // F becomes true when some circuit through V back to S is found.
  bool F = false;
  Stack.insert(SV); // Current path from S to V.
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Global cap on the number of circuits recorded, to bound compile time.
    if (NumPaths > MaxPaths)
      break;
    if (W < S)
      continue;
    if (W == S) {
      // The current stack plus the edge V->S forms a circuit. Record it
      // unless the path was reached through a back-edge in the topological
      // order (HasBackedge).
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      // The edge V->W counts as a back-edge when W precedes V in the
      // topological order.
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V was found: keep V blocked and register it in the
    // B sets of its successors so it is unblocked when any of them is
    // (standard Johnson bookkeeping).
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
2014
2015/// Unblock a node in the circuit finding algorithm.
2016void SwingSchedulerDAG::Circuits::unblock(int U) {
2017 Blocked.reset(U);
2018 SmallPtrSet<SUnit *, 4> &BU = B[U];
2019 while (!BU.empty()) {
2020 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2021 assert(SI != BU.end() && "Invalid B set.");
2022 SUnit *W = *SI;
2023 BU.erase(W);
2024 if (Blocked.test(W->NodeNum))
2025 unblock(W->NodeNum);
2026 }
2027}
2028
2029/// Identify all the elementary circuits in the dependence graph using
2030/// Johnson's circuit algorithm.
2031void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2032 Circuits Cir(SUnits, Topo);
2033 // Create the adjacency structure.
2034 Cir.createAdjacencyStructure(this);
2035 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2036 Cir.reset();
2037 Cir.circuit(I, I, NodeSets, this);
2038 }
2039}
2040
2041// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
2042// is loop-carried to the USE in next iteration. This will help pipeliner avoid
2043// additional copies that are needed across iterations. An artificial dependence
2044// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
2045
2046// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
2047// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
2048// PHI-------True-Dep------> USEOfPhi
2049
2050// The mutation creates
2051// USEOfPHI -------Artificial-Dep---> SRCOfCopy
2052
2053// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
2054// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
2055// late to avoid additional copies across iterations. The possible scheduling
2056// order would be
2057// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
2058
2059void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
2060 for (SUnit &SU : DAG->SUnits) {
2061 // Find the COPY/REG_SEQUENCE instruction.
2062 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
2063 continue;
2064
2065 // Record the loop carried PHIs.
2067 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2069
2070 for (auto &Dep : SU.Preds) {
2071 SUnit *TmpSU = Dep.getSUnit();
2072 MachineInstr *TmpMI = TmpSU->getInstr();
2073 SDep::Kind DepKind = Dep.getKind();
2074 // Save the loop carried PHI.
2075 if (DepKind == SDep::Anti && TmpMI->isPHI())
2076 PHISUs.push_back(TmpSU);
2077 // Save the source of COPY/REG_SEQUENCE.
2078 // If the source has no pre-decessors, we will end up creating cycles.
2079 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2080 SrcSUs.push_back(TmpSU);
2081 }
2082
2083 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2084 continue;
2085
2086 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2087 // SUnit to the container.
2089 // Do not use iterator based loop here as we are updating the container.
2090 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2091 for (auto &Dep : PHISUs[Index]->Succs) {
2092 if (Dep.getKind() != SDep::Data)
2093 continue;
2094
2095 SUnit *TmpSU = Dep.getSUnit();
2096 MachineInstr *TmpMI = TmpSU->getInstr();
2097 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2098 PHISUs.push_back(TmpSU);
2099 continue;
2100 }
2101 UseSUs.push_back(TmpSU);
2102 }
2103 }
2104
2105 if (UseSUs.size() == 0)
2106 continue;
2107
2108 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2109 // Add the artificial dependencies if it does not form a cycle.
2110 for (auto *I : UseSUs) {
2111 for (auto *Src : SrcSUs) {
2112 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2113 Src->addPred(SDep(I, SDep::Artificial));
2114 SDAG->Topo.AddPred(Src, I);
2115 }
2116 }
2117 }
2118 }
2119}
2120
2121/// Compute several functions need to order the nodes for scheduling.
2122/// ASAP - Earliest time to schedule a node.
2123/// ALAP - Latest time to schedule a node.
2124/// MOV - Mobility function, difference between ALAP and ASAP.
2125/// D - Depth of each node.
2126/// H - Height of each node.
2127void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2128 ScheduleInfo.resize(SUnits.size());
2129
2130 LLVM_DEBUG({
2131 for (int I : Topo) {
2132 const SUnit &SU = SUnits[I];
2133 dumpNode(SU);
2134 }
2135 });
2136
2137 int maxASAP = 0;
2138 // Compute ASAP and ZeroLatencyDepth.
2139 for (int I : Topo) {
2140 int asap = 0;
2141 int zeroLatencyDepth = 0;
2142 SUnit *SU = &SUnits[I];
2143 for (const auto &IE : DDG->getInEdges(SU)) {
2144 SUnit *Pred = IE.getSrc();
2145 if (IE.getLatency() == 0)
2146 zeroLatencyDepth =
2147 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2148 if (IE.ignoreDependence(true))
2149 continue;
2150 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2151 IE.getDistance() * MII));
2152 }
2153 maxASAP = std::max(maxASAP, asap);
2154 ScheduleInfo[I].ASAP = asap;
2155 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2156 }
2157
2158 // Compute ALAP, ZeroLatencyHeight, and MOV.
2159 for (int I : llvm::reverse(Topo)) {
2160 int alap = maxASAP;
2161 int zeroLatencyHeight = 0;
2162 SUnit *SU = &SUnits[I];
2163 for (const auto &OE : DDG->getOutEdges(SU)) {
2164 SUnit *Succ = OE.getDst();
2165 if (Succ->isBoundaryNode())
2166 continue;
2167 if (OE.getLatency() == 0)
2168 zeroLatencyHeight =
2169 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2170 if (OE.ignoreDependence(true))
2171 continue;
2172 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2173 OE.getDistance() * MII));
2174 }
2175
2176 ScheduleInfo[I].ALAP = alap;
2177 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2178 }
2179
2180 // After computing the node functions, compute the summary for each node set.
2181 for (NodeSet &I : NodeSets)
2182 I.computeNodeSetInfo(this);
2183
2184 LLVM_DEBUG({
2185 for (unsigned i = 0; i < SUnits.size(); i++) {
2186 dbgs() << "\tNode " << i << ":\n";
2187 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2188 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2189 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2190 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2191 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2192 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2193 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2194 }
2195 });
2196}
2197
2198/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2199/// as the predecessors of the elements of NodeOrder that are not also in
2200/// NodeOrder.
2203 const NodeSet *S = nullptr) {
2204 Preds.clear();
2205
2206 for (SUnit *SU : NodeOrder) {
2207 for (const auto &IE : DDG->getInEdges(SU)) {
2208 SUnit *PredSU = IE.getSrc();
2209 if (S && S->count(PredSU) == 0)
2210 continue;
2211 if (IE.ignoreDependence(true))
2212 continue;
2213 if (NodeOrder.count(PredSU) == 0)
2214 Preds.insert(PredSU);
2215 }
2216
2217 // FIXME: The following loop-carried dependencies may also need to be
2218 // considered.
2219 // - Physical register dependencies (true-dependence and WAW).
2220 // - Memory dependencies.
2221 for (const auto &OE : DDG->getOutEdges(SU)) {
2222 SUnit *SuccSU = OE.getDst();
2223 if (!OE.isAntiDep())
2224 continue;
2225 if (S && S->count(SuccSU) == 0)
2226 continue;
2227 if (NodeOrder.count(SuccSU) == 0)
2228 Preds.insert(SuccSU);
2229 }
2230 }
2231 return !Preds.empty();
2232}
2233
2234/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2235/// as the successors of the elements of NodeOrder that are not also in
2236/// NodeOrder.
2239 const NodeSet *S = nullptr) {
2240 Succs.clear();
2241
2242 for (SUnit *SU : NodeOrder) {
2243 for (const auto &OE : DDG->getOutEdges(SU)) {
2244 SUnit *SuccSU = OE.getDst();
2245 if (S && S->count(SuccSU) == 0)
2246 continue;
2247 if (OE.ignoreDependence(false))
2248 continue;
2249 if (NodeOrder.count(SuccSU) == 0)
2250 Succs.insert(SuccSU);
2251 }
2252
2253 // FIXME: The following loop-carried dependencies may also need to be
2254 // considered.
2255 // - Physical register dependnecies (true-dependnece and WAW).
2256 // - Memory dependencies.
2257 for (const auto &IE : DDG->getInEdges(SU)) {
2258 SUnit *PredSU = IE.getSrc();
2259 if (!IE.isAntiDep())
2260 continue;
2261 if (S && S->count(PredSU) == 0)
2262 continue;
2263 if (NodeOrder.count(PredSU) == 0)
2264 Succs.insert(PredSU);
2265 }
2266 }
2267 return !Succs.empty();
2268}
2269
2270/// Return true if there is a path from the specified node to any of the nodes
2271/// in DestNodes. Keep track and return the nodes in any path.
2272static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2273 SetVector<SUnit *> &DestNodes,
2274 SetVector<SUnit *> &Exclude,
2275 SmallPtrSet<SUnit *, 8> &Visited,
2276 SwingSchedulerDDG *DDG) {
2277 if (Cur->isBoundaryNode())
2278 return false;
2279 if (Exclude.contains(Cur))
2280 return false;
2281 if (DestNodes.contains(Cur))
2282 return true;
2283 if (!Visited.insert(Cur).second)
2284 return Path.contains(Cur);
2285 bool FoundPath = false;
2286 for (const auto &OE : DDG->getOutEdges(Cur))
2287 if (!OE.ignoreDependence(false))
2288 FoundPath |=
2289 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2290 for (const auto &IE : DDG->getInEdges(Cur))
2291 if (IE.isAntiDep() && IE.getDistance() == 0)
2292 FoundPath |=
2293 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2294 if (FoundPath)
2295 Path.insert(Cur);
2296 return FoundPath;
2297}
2298
2299/// Compute the live-out registers for the instructions in a node-set.
2300/// The live-out registers are those that are defined in the node-set,
2301/// but not used. Except for use operands of Phis.
2303 NodeSet &NS) {
2308 for (SUnit *SU : NS) {
2309 const MachineInstr *MI = SU->getInstr();
2310 if (MI->isPHI())
2311 continue;
2312 for (const MachineOperand &MO : MI->all_uses()) {
2313 Register Reg = MO.getReg();
2314 if (Reg.isVirtual())
2315 Uses.insert(VirtRegOrUnit(Reg));
2316 else if (MRI.isAllocatable(Reg))
2317 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2318 Uses.insert(VirtRegOrUnit(Unit));
2319 }
2320 }
2321 for (SUnit *SU : NS)
2322 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2323 if (!MO.isDead()) {
2324 Register Reg = MO.getReg();
2325 if (Reg.isVirtual()) {
2326 if (!Uses.count(VirtRegOrUnit(Reg)))
2327 LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
2329 } else if (MRI.isAllocatable(Reg)) {
2330 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2331 if (!Uses.count(VirtRegOrUnit(Unit)))
2332 LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
2334 }
2335 }
2336 RPTracker.addLiveRegs(LiveOutRegs);
2337}
2338
2339/// A heuristic to filter nodes in recurrent node-sets if the register
2340/// pressure of a set is too high.
2341void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2342 for (auto &NS : NodeSets) {
2343 // Skip small node-sets since they won't cause register pressure problems.
2344 if (NS.size() <= 2)
2345 continue;
2346 IntervalPressure RecRegPressure;
2347 RegPressureTracker RecRPTracker(RecRegPressure);
2348 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2349 computeLiveOuts(MF, RecRPTracker, NS);
2350 RecRPTracker.closeBottom();
2351
2352 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2353 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2354 return A->NodeNum > B->NodeNum;
2355 });
2356
2357 for (auto &SU : SUnits) {
2358 // Since we're computing the register pressure for a subset of the
2359 // instructions in a block, we need to set the tracker for each
2360 // instruction in the node-set. The tracker is set to the instruction
2361 // just after the one we're interested in.
2363 RecRPTracker.setPos(std::next(CurInstI));
2364
2365 RegPressureDelta RPDelta;
2366 ArrayRef<PressureChange> CriticalPSets;
2367 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2368 CriticalPSets,
2369 RecRegPressure.MaxSetPressure);
2370 if (RPDelta.Excess.isValid()) {
2371 LLVM_DEBUG(
2372 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2373 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2374 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2375 NS.setExceedPressure(SU);
2376 break;
2377 }
2378 RecRPTracker.recede();
2379 }
2380 }
2381}
2382
2383/// A heuristic to colocate node sets that have the same set of
2384/// successors.
2385void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2386 unsigned Colocate = 0;
2387 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2388 NodeSet &N1 = NodeSets[i];
2389 SmallSetVector<SUnit *, 8> S1;
2390 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2391 continue;
2392 for (int j = i + 1; j < e; ++j) {
2393 NodeSet &N2 = NodeSets[j];
2394 if (N1.compareRecMII(N2) != 0)
2395 continue;
2396 SmallSetVector<SUnit *, 8> S2;
2397 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2398 continue;
2399 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2400 N1.setColocate(++Colocate);
2401 N2.setColocate(Colocate);
2402 break;
2403 }
2404 }
2405 }
2406}
2407
2408/// Check if the existing node-sets are profitable. If not, then ignore the
2409/// recurrent node-sets, and attempt to schedule all nodes together. This is
2410/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2411/// then it's best to try to schedule all instructions together instead of
2412/// starting with the recurrent node-sets.
2413void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2414 // Look for loops with a large MII.
2415 if (MII < 17)
2416 return;
2417 // Check if the node-set contains only a simple add recurrence.
2418 for (auto &NS : NodeSets) {
2419 if (NS.getRecMII() > 2)
2420 return;
2421 if (NS.getMaxDepth() > MII)
2422 return;
2423 }
2424 NodeSets.clear();
2425 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2426}
2427
2428/// Add the nodes that do not belong to a recurrence set into groups
2429/// based upon connected components.
2430void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2431 SetVector<SUnit *> NodesAdded;
2432 SmallPtrSet<SUnit *, 8> Visited;
2433 // Add the nodes that are on a path between the previous node sets and
2434 // the current node set.
2435 for (NodeSet &I : NodeSets) {
2436 SmallSetVector<SUnit *, 8> N;
2437 // Add the nodes from the current node set to the previous node set.
2438 if (succ_L(I, N, DDG.get())) {
2439 SetVector<SUnit *> Path;
2440 for (SUnit *NI : N) {
2441 Visited.clear();
2442 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2443 }
2444 if (!Path.empty())
2445 I.insert(Path.begin(), Path.end());
2446 }
2447 // Add the nodes from the previous node set to the current node set.
2448 N.clear();
2449 if (succ_L(NodesAdded, N, DDG.get())) {
2450 SetVector<SUnit *> Path;
2451 for (SUnit *NI : N) {
2452 Visited.clear();
2453 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2454 }
2455 if (!Path.empty())
2456 I.insert(Path.begin(), Path.end());
2457 }
2458 NodesAdded.insert_range(I);
2459 }
2460
2461 // Create a new node set with the connected nodes of any successor of a node
2462 // in a recurrent set.
2463 NodeSet NewSet;
2464 SmallSetVector<SUnit *, 8> N;
2465 if (succ_L(NodesAdded, N, DDG.get()))
2466 for (SUnit *I : N)
2467 addConnectedNodes(I, NewSet, NodesAdded);
2468 if (!NewSet.empty())
2469 NodeSets.push_back(NewSet);
2470
2471 // Create a new node set with the connected nodes of any predecessor of a node
2472 // in a recurrent set.
2473 NewSet.clear();
2474 if (pred_L(NodesAdded, N, DDG.get()))
2475 for (SUnit *I : N)
2476 addConnectedNodes(I, NewSet, NodesAdded);
2477 if (!NewSet.empty())
2478 NodeSets.push_back(NewSet);
2479
2480 // Create new nodes sets with the connected nodes any remaining node that
2481 // has no predecessor.
2482 for (SUnit &SU : SUnits) {
2483 if (NodesAdded.count(&SU) == 0) {
2484 NewSet.clear();
2485 addConnectedNodes(&SU, NewSet, NodesAdded);
2486 if (!NewSet.empty())
2487 NodeSets.push_back(NewSet);
2488 }
2489 }
2490}
2491
2492/// Add the node to the set, and add all of its connected nodes to the set.
2493void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2494 SetVector<SUnit *> &NodesAdded) {
2495 NewSet.insert(SU);
2496 NodesAdded.insert(SU);
2497 for (auto &OE : DDG->getOutEdges(SU)) {
2498 SUnit *Successor = OE.getDst();
2499 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2500 NodesAdded.count(Successor) == 0)
2501 addConnectedNodes(Successor, NewSet, NodesAdded);
2502 }
2503 for (auto &IE : DDG->getInEdges(SU)) {
2504 SUnit *Predecessor = IE.getSrc();
2505 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2506 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2507 }
2508}
2509
2510/// Return true if Set1 contains elements in Set2. The elements in common
2511/// are returned in a different container.
2512static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2514 Result.clear();
2515 for (SUnit *SU : Set1) {
2516 if (Set2.count(SU) != 0)
2517 Result.insert(SU);
2518 }
2519 return !Result.empty();
2520}
2521
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two node-sets are fused when they start at the same node.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger recurrence MII of the two sets.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() leaves J pointing at the next element, but invalidates the
        // cached end iterator, so refresh E. I precedes J and stays valid.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2542
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from the later set any node already present in the earlier one.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // erase() leaves J pointing at the next element, but invalidates the
        // cached end iterator, so refresh E. I precedes J and stays valid.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2558
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Pick the initial direction and seed the work-list R based on how this
    // node-set relates to the nodes that are already ordered.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // A single node-set: start bottom-up from the nodes with no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    // Drain R in the current direction, switching direction when R runs dry
    // and reseeding it from the other side of the ordered nodes.
    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Feed the not-yet-ordered successors within this node-set back
          // into the work-list.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // Anti-dependence predecessors act as successors here.
          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // R is exhausted: reverse direction and reseed from the ordered
        // nodes' predecessors inside this node-set.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // A node flagged for excess register pressure restarts the set
          // top-down from its first node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Feed the not-yet-ordered predecessors within this node-set back
          // into the work-list.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // Anti-dependence successors act as predecessors here.
          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // R is exhausted: reverse direction and reseed from the ordered
        // nodes' successors inside this node-set.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2724
2725/// Process the nodes in the computed order and create the pipelined schedule
2726/// of the instructions, if possible. Return true if a schedule is found.
2727bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2728
2729 if (NodeOrder.empty()){
2730 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2731 return false;
2732 }
2733
2734 bool scheduleFound = false;
2735 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2736 if (LimitRegPressure) {
2737 HRPDetector =
2738 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2739 HRPDetector->init(RegClassInfo);
2740 }
2741 // Keep increasing II until a valid schedule is found.
2742 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2743 Schedule.reset();
2744 Schedule.setInitiationInterval(II);
2745 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2746
2749 do {
2750 SUnit *SU = *NI;
2751
2752 // Compute the schedule time for the instruction, which is based
2753 // upon the scheduled time for any predecessors/successors.
2754 int EarlyStart = INT_MIN;
2755 int LateStart = INT_MAX;
2756 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2757 LLVM_DEBUG({
2758 dbgs() << "\n";
2759 dbgs() << "Inst (" << SU->NodeNum << ") ";
2760 SU->getInstr()->dump();
2761 dbgs() << "\n";
2762 });
2763 LLVM_DEBUG(
2764 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2765
2766 if (EarlyStart > LateStart)
2767 scheduleFound = false;
2768 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2769 scheduleFound =
2770 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2771 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2772 scheduleFound =
2773 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2774 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2775 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2776 // When scheduling a Phi it is better to start at the late cycle and
2777 // go backwards. The default order may insert the Phi too far away
2778 // from its first dependence.
2779 // Also, do backward search when all scheduled predecessors are
2780 // loop-carried output/order dependencies. Empirically, there are also
2781 // cases where scheduling becomes possible with backward search.
2782 if (SU->getInstr()->isPHI() ||
2783 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2784 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2785 else
2786 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2787 } else {
2788 int FirstCycle = Schedule.getFirstCycle();
2789 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2790 FirstCycle + getASAP(SU) + II - 1, II);
2791 }
2792
2793 // Even if we find a schedule, make sure the schedule doesn't exceed the
2794 // allowable number of stages. We keep trying if this happens.
2795 if (scheduleFound)
2796 if (SwpMaxStages > -1 &&
2797 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2798 scheduleFound = false;
2799
2800 LLVM_DEBUG({
2801 if (!scheduleFound)
2802 dbgs() << "\tCan't schedule\n";
2803 });
2804 } while (++NI != NE && scheduleFound);
2805
2806 // If a schedule is found, validate it against the validation-only
2807 // dependencies.
2808 if (scheduleFound)
2809 scheduleFound = DDG->isValidSchedule(Schedule);
2810
2811 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2812 if (scheduleFound)
2813 scheduleFound =
2814 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2815
2816 // If a schedule is found, check if it is a valid schedule too.
2817 if (scheduleFound)
2818 scheduleFound = Schedule.isValidSchedule(this);
2819
2820 // If a schedule was found and the option is enabled, check if the schedule
2821 // might generate additional register spills/fills.
2822 if (scheduleFound && LimitRegPressure)
2823 scheduleFound =
2824 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2825 }
2826
2827 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2828 << " (II=" << Schedule.getInitiationInterval()
2829 << ")\n");
2830
2831 if (scheduleFound) {
2832 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2833 if (!scheduleFound)
2834 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2835 }
2836
2837 if (scheduleFound) {
2838 Schedule.finalizeSchedule(this);
2839 Pass.ORE->emit([&]() {
2840 return MachineOptimizationRemarkAnalysis(
2841 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2842 << "Schedule found with Initiation Interval: "
2843 << ore::NV("II", Schedule.getInitiationInterval())
2844 << ", MaxStageCount: "
2845 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2846 });
2847 } else
2848 Schedule.reset();
2849
2850 return scheduleFound && Schedule.getMaxStageCount() > 0;
2851}
2852
2854 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2855 Register Result;
2856 for (const MachineOperand &Use : MI.all_uses()) {
2857 Register Reg = Use.getReg();
2858 if (!Reg.isVirtual())
2859 return Register();
2860 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2861 continue;
2862 if (Result)
2863 return Register();
2864 Result = Reg;
2865 }
2866 return Result;
2867}
2868
2869/// When Op is a value that is incremented recursively in a loop and there is a
2870/// unique instruction that increments it, returns true and sets Value.
2872 if (!Op.isReg() || !Op.getReg().isVirtual())
2873 return false;
2874
2875 Register OrgReg = Op.getReg();
2876 Register CurReg = OrgReg;
2877 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2878 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2879
2880 const TargetInstrInfo *TII =
2881 LoopBB->getParent()->getSubtarget().getInstrInfo();
2882 const TargetRegisterInfo *TRI =
2883 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2884
2885 MachineInstr *Phi = nullptr;
2886 MachineInstr *Increment = nullptr;
2887
2888 // Traverse definitions until it reaches Op or an instruction that does not
2889 // satisfy the condition.
2890 // Acceptable example:
2891 // bb.0:
2892 // %0 = PHI %3, %bb.0, ...
2893 // %2 = ADD %0, Value
2894 // ... = LOAD %2(Op)
2895 // %3 = COPY %2
2896 while (true) {
2897 if (!CurReg.isValid() || !CurReg.isVirtual())
2898 return false;
2899 MachineInstr *Def = MRI.getVRegDef(CurReg);
2900 if (Def->getParent() != LoopBB)
2901 return false;
2902
2903 if (Def->isCopy()) {
2904 // Ignore copy instructions unless they contain subregisters
2905 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2906 return false;
2907 CurReg = Def->getOperand(1).getReg();
2908 } else if (Def->isPHI()) {
2909 // There must be just one Phi
2910 if (Phi)
2911 return false;
2912 Phi = Def;
2913 CurReg = getLoopPhiReg(*Def, LoopBB);
2914 } else if (TII->getIncrementValue(*Def, Value)) {
2915 // Potentially a unique increment
2916 if (Increment)
2917 // Multiple increments exist
2918 return false;
2919
2920 const MachineOperand *BaseOp;
2921 int64_t Offset;
2922 bool OffsetIsScalable;
2923 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2924 TRI)) {
2925 // Pre/post increment instruction
2926 CurReg = BaseOp->getReg();
2927 } else {
2928 // If only one of the operands is defined within the loop, it is assumed
2929 // to be an incremented value.
2930 CurReg = findUniqueOperandDefinedInLoop(*Def);
2931 if (!CurReg.isValid())
2932 return false;
2933 }
2934 Increment = Def;
2935 } else {
2936 return false;
2937 }
2938 if (CurReg == OrgReg)
2939 break;
2940 }
2941
2942 if (!Phi || !Increment)
2943 return false;
2944
2945 return true;
2946}
2947
2948/// Return true if we can compute the amount the instruction changes
2949/// during each iteration. Set Delta to the amount of the change.
2950bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2951 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2952 const MachineOperand *BaseOp;
2953 int64_t Offset;
2954 bool OffsetIsScalable;
2955 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2956 return false;
2957
2958 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2959 if (OffsetIsScalable)
2960 return false;
2961
2962 if (!BaseOp->isReg())
2963 return false;
2964
2965 return findLoopIncrementValue(*BaseOp, Delta);
2966}
2967
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
/// v1 = Phi(v0, v3)
/// v2 = load v1, 0
/// v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              Register &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction. Post-increment loads already consume their
  // base update, so they cannot be rebased.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction that merges the initial and the
  // loop-carried value of the base register.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction that produces the
  // loop-carried base value (v3 in the example above).
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration. A throwaway clone with the combined offset is used
  // only to query the target's disjointness analysis, then deleted.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
3028
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
    SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // The recorded change is a <replacement base register, per-stage offset
    // delta> pair for this instruction.
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare where the base-register definition and this use were placed in
    // the final schedule; an adjustment is needed only when the use lands in
    // an earlier stage than the definition.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // The definition comes earlier in the cycle order, so switch to the
        // replacement base register and compensate by one fewer stage.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Scale the recorded per-stage delta by the stage distance.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Install the rewritten instruction and remember the mapping so the
      // original can be restored or cleaned up later.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
3064
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against cycles of Phis: stop if we see the same Phi twice.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in <value, block> pairs starting at index 1; follow
    // the incoming value that flows from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3082
3083/// Return false if there is no overlap between the region accessed by BaseMI in
3084/// an iteration and the region accessed by OtherMI in subsequent iterations.
3086 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
3087 int DeltaB, DeltaO, Delta;
3088 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
3089 DeltaB != DeltaO)
3090 return true;
3091 Delta = DeltaB;
3092
3093 const MachineOperand *BaseOpB, *BaseOpO;
3094 int64_t OffsetB, OffsetO;
3095 bool OffsetBIsScalable, OffsetOIsScalable;
3096 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3097 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
3098 OffsetBIsScalable, TRI) ||
3099 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
3100 OffsetOIsScalable, TRI))
3101 return true;
3102
3103 if (OffsetBIsScalable || OffsetOIsScalable)
3104 return true;
3105
3106 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
3107 // Pass cases with different base operands but same initial values.
3108 // Typically for when pre/post increment is used.
3109
3110 if (!BaseOpB->isReg() || !BaseOpO->isReg())
3111 return true;
3112 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3113 if (!RegB.isVirtual() || !RegO.isVirtual())
3114 return true;
3115
3116 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
3117 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
3118 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
3119 return true;
3120
3121 Register InitValB;
3122 Register LoopValB;
3123 Register InitValO;
3124 Register LoopValO;
3125 getPhiRegs(*DefB, BB, InitValB, LoopValB);
3126 getPhiRegs(*DefO, BB, InitValO, LoopValO);
3127 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
3128 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
3129
3130 if (!InitDefB->isIdenticalTo(*InitDefO))
3131 return true;
3132 }
3133
3134 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3135 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3136
3137 // This is the main test, which checks the offset values and the loop
3138 // increment value to determine if the accesses may be loop carried.
3139 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3140 return true;
3141
3142 LLVM_DEBUG({
3143 dbgs() << "Overlap check:\n";
3144 dbgs() << " BaseMI: ";
3145 BaseMI->dump();
3146 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3147 << ", Len: " << AccessSizeB.getValue() << "\n";
3148 dbgs() << " OtherMI: ";
3149 OtherMI->dump();
3150 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3151 << ", Len: " << AccessSizeO.getValue() << "\n";
3152 });
3153
3154 // Excessive overlap may be detected in strided patterns.
3155 // For example, the memory addresses of the store and the load in
3156 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3157 // are assumed to overlap.
3158 if (Delta < 0) {
3159 int64_t BaseMinAddr = OffsetB;
3160 int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
3161 if (BaseMinAddr > OhterNextIterMaxAddr) {
3162 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3163 return false;
3164 }
3165 } else {
3166 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3167 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3168 if (BaseMaxAddr < OtherNextIterMinAddr) {
3169 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3170 return false;
3171 }
3172 }
3173 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3174 return true;
3175}
3176
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order/output dependences to real (non-boundary, non-artificial)
  // nodes can be loop carried.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only memory accesses can carry a dependence across iterations here.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3210
3211void SwingSchedulerDAG::postProcessDAG() {
3212 for (auto &M : Mutations)
3213 M->apply(this);
3214}
3215
3216/// Try to schedule the node at the specified StartCycle and continue
3217/// until the node is schedule or the EndCycle is reached. This function
3218/// returns true if the node is scheduled. This routine may search either
3219/// forward or backward for a place to insert the instruction based upon
3220/// the relative values of StartCycle and EndCycle.
3221bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3222 bool forward = true;
3223 LLVM_DEBUG({
3224 dbgs() << "Trying to insert node between " << StartCycle << " and "
3225 << EndCycle << " II: " << II << "\n";
3226 });
3227 if (StartCycle > EndCycle)
3228 forward = false;
3229
3230 // The terminating condition depends on the direction.
3231 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3232 for (int curCycle = StartCycle; curCycle != termCycle;
3233 forward ? ++curCycle : --curCycle) {
3234
3235 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3236 ProcItinResources.canReserveResources(*SU, curCycle)) {
3237 LLVM_DEBUG({
3238 dbgs() << "\tinsert at cycle " << curCycle << " ";
3239 SU->getInstr()->dump();
3240 });
3241
3242 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3243 ProcItinResources.reserveResources(*SU, curCycle);
3244 ScheduledInstrs[curCycle].push_back(SU);
3245 InstrToCycle.insert(std::make_pair(SU, curCycle));
3246 if (curCycle > LastCycle)
3247 LastCycle = curCycle;
3248 if (curCycle < FirstCycle)
3249 FirstCycle = curCycle;
3250 return true;
3251 }
3252 LLVM_DEBUG({
3253 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3254 SU->getInstr()->dump();
3255 });
3256 }
3257 return false;
3258}
3259
// Return the cycle of the earliest scheduled instruction in the chain
// reached from Dep by walking order/output in-edges backwards.
// Returns INT_MAX if no instruction in the chain has been scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    if (Visited.count(PrevSU))
      continue;
    // Unscheduled nodes do not contribute a cycle and are not expanded.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    // Continue the walk only through order and output dependences.
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3283
// Return the cycle of the latest scheduled instruction in the chain
// reached from Dep by walking order/output out-edges forwards.
// Returns INT_MIN if no instruction in the chain has been scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    // Boundary nodes are not real instructions; skip them.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    // Unscheduled nodes do not contribute a cycle and are not expanded.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    // Continue the walk only through order and output dependences.
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3307
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi it feeds; otherwise return nullptr. These instructions
/// are characterized by having a back-edge to a Phi, which contains a
/// reference to another Phi.
  for (auto &P : SU->Preds)
    // An anti-dependence predecessor that is a Phi is SU's back-edge target.
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      // The Phi spans iterations when its value feeds another Phi.
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3319
/// Compute the scheduling start slot for the instruction. The start slot
/// depends on any predecessor or successor nodes scheduled already.
/// MaxEarlyStart is raised by scheduled predecessors; MinLateStart is
/// lowered by scheduled successors and by loop-carried dependences.
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
                              int II, SwingSchedulerDAG *DAG) {
  const SwingSchedulerDDG *DDG = DAG->getDDG();

  // Iterate over each instruction that has been scheduled already. The start
  // slot computation depends on whether the previously scheduled instruction
  // is a predecessor or successor of the specified instruction.
  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    for (SUnit *I : getInstructions(cycle)) {
      // Scheduled predecessors constrain how early SU may be placed.
      for (const auto &IE : DDG->getInEdges(SU)) {
        if (IE.getSrc() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(IE)) {
            int End = earliestCycleInChain(IE, DDG) + (II - 1);
            *MinLateStart = std::min(*MinLateStart, End);
          }
          // Latency pushes SU later; each iteration of distance buys II
          // cycles of slack.
          int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
          *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
        }
      }

      // Scheduled successors constrain how late SU may be placed.
      for (const auto &OE : DDG->getOutEdges(SU)) {
        if (OE.getDst() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(OE)) {
            int Start = latestCycleInChain(OE, DDG) + 1 - II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
          }
          int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
          *MinLateStart = std::min(*MinLateStart, LateStart);
        }
      }

      SUnit *BE = multipleIterations(I, DAG);
      for (const auto &Dep : SU->Preds) {
        // For instruction that requires multiple iterations, make sure that
        // the dependent instruction is not scheduled past the definition.
        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
            !SU->isPred(I))
          *MinLateStart = std::min(*MinLateStart, cycle);
      }
    }
  }
}
3368
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Classification of SU relative to each instruction already in Insts.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  // Positions within Insts of the instructions forcing the ordering.
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      // If this instruction's base register has been rewritten (see
      // InstrChanges), compare against the replacement register instead.
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      // SU defines a value read by *I in the same or earlier stage: SU's
      // def must come first.
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        // SU uses a Phi whose loop-carried value *I defines; note it but let
        // OrderAfterDef take precedence below.
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      // The source of a same-stage anti/output/order edge must stay ahead.
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    // Pull out the conflicting instructions and re-insert all three in
    // dependence order (recursing via this same routine).
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3501
3502/// Return true if the scheduled Phi has a loop carried operand.
3504 MachineInstr &Phi) const {
3505 if (!Phi.isPHI())
3506 return false;
3507 assert(Phi.isPHI() && "Expecting a Phi.");
3508 SUnit *DefSU = SSD->getSUnit(&Phi);
3509 unsigned DefCycle = cycleScheduled(DefSU);
3510 int DefStage = stageScheduled(DefSU);
3511
3512 Register InitVal;
3513 Register LoopVal;
3514 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3515 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3516 if (!UseSU)
3517 return true;
3518 if (UseSU->getInstr()->isPHI())
3519 return true;
3520 unsigned LoopCycle = cycleScheduled(UseSU);
3521 int LoopStage = stageScheduled(UseSU);
3522 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3523}
3524
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
    MachineInstr *Def,
    MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  // A Phi cannot itself be the loop-carried definition in this pattern.
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def carries the dependence iff it defines the Phi's loop-carried value.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3551
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): as written, any in-edge whose source is already scheduled
  // makes this return false; the edge kind is not inspected here. Presumably
  // the in-edges of SU at this point are only loop-carried output/order
  // dependences, matching the name -- confirm against the DDG construction.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3561
/// Determine transitive dependences of unpipelineable instructions: start
/// from the nodes the target asks to ignore for pipelining and close the set
/// over their (transitive) predecessors.
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with nodes the target explicitly excludes.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable node depends on is also unpipelineable.
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3590
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Nodes that may be pipelined, or are already in stage 0, only update
    // the running last-cycle bound.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    // It must not be placed before any same-iteration dependence source.
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move SU from its old cycle's instruction list to the new cycle's.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             |  Inst0  <-+
    //   SU order  |           | artificial dep
    //             |  Inst1  --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3659
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    // Every physreg consumer must be in the same stage and a strictly
    // later cycle than the definition.
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3688
3689/// A property of the node order in swing-modulo-scheduling is
3690/// that for nodes outside circuits the following holds:
3691/// none of them is scheduled after both a successor and a
3692/// predecessor.
3693/// The method below checks whether the property is met.
3694/// If not, debug information is printed and statistics information updated.
3695/// Note that we do not use an assert statement.
3696/// The reason is that although an invalid node order may prevent
3697/// the pipeliner from finding a pipelined schedule for arbitrary II,
3698/// it does not lead to the generation of incorrect code.
3699void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3700
3701 // a sorted vector that maps each SUnit to its index in the NodeOrder
3702 typedef std::pair<SUnit *, unsigned> UnitIndex;
3703 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3704
3705 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3706 Indices.push_back(std::make_pair(NodeOrder[i], i));
3707
3708 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3709 return std::get<0>(i1) < std::get<0>(i2);
3710 };
3711
3712 // sort, so that we can perform a binary search
3713 llvm::sort(Indices, CompareKey);
3714
3715 bool Valid = true;
3716 (void)Valid;
3717 // for each SUnit in the NodeOrder, check whether
3718 // it appears after both a successor and a predecessor
3719 // of the SUnit. If this is the case, and the SUnit
3720 // is not part of circuit, then the NodeOrder is not
3721 // valid.
3722 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3723 SUnit *SU = NodeOrder[i];
3724 unsigned Index = i;
3725
3726 bool PredBefore = false;
3727 bool SuccBefore = false;
3728
3729 SUnit *Succ;
3730 SUnit *Pred;
3731 (void)Succ;
3732 (void)Pred;
3733
3734 for (const auto &IE : DDG->getInEdges(SU)) {
3735 SUnit *PredSU = IE.getSrc();
3736 unsigned PredIndex = std::get<1>(
3737 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3738 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3739 PredBefore = true;
3740 Pred = PredSU;
3741 break;
3742 }
3743 }
3744
3745 for (const auto &OE : DDG->getOutEdges(SU)) {
3746 SUnit *SuccSU = OE.getDst();
3747 // Do not process a boundary node, it was not included in NodeOrder,
3748 // hence not in Indices either, call to std::lower_bound() below will
3749 // return Indices.end().
3750 if (SuccSU->isBoundaryNode())
3751 continue;
3752 unsigned SuccIndex = std::get<1>(
3753 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3754 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3755 SuccBefore = true;
3756 Succ = SuccSU;
3757 break;
3758 }
3759 }
3760
3761 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3762 // instructions in circuits are allowed to be scheduled
3763 // after both a successor and predecessor.
3764 bool InCircuit = llvm::any_of(
3765 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3766 if (InCircuit)
3767 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3768 else {
3769 Valid = false;
3770 NumNodeOrderIssues++;
3771 LLVM_DEBUG(dbgs() << "Predecessor ");
3772 }
3773 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3774 << " are scheduled before node " << SU->NodeNum
3775 << "\n");
3776 }
3777 }
3778
3779 LLVM_DEBUG({
3780 if (!Valid)
3781 dbgs() << "Invalid node order found!\n";
3782 });
3783}
3784
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // p and p' from the example above, carried across loop iterations of the
  // Instrs walk: set when a tied def/use pair is found, consumed when a
  // later use of p is seen.
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            // Subtract the recorded delta since the new base already
            // includes the increment.
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // The pending overlap has been handled (or cannot be); reset.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3836
3837std::deque<SUnit *>
3839 const std::deque<SUnit *> &Instrs) const {
3840 std::deque<SUnit *> NewOrderPhi;
3841 for (SUnit *SU : Instrs) {
3842 if (SU->getInstr()->isPHI())
3843 NewOrderPhi.push_back(SU);
3844 }
3845 std::deque<SUnit *> NewOrderI;
3846 for (SUnit *SU : Instrs) {
3847 if (!SU->getInstr()->isPHI())
3848 orderDependence(SSD, SU, NewOrderI);
3849 }
3850 llvm::append_range(NewOrderPhi, NewOrderI);
3851 return NewOrderPhi;
3852}
3853
3854/// After the schedule has been formed, call this function to combine
3855/// the instructions from the different stages/cycles. That is, this
3856/// function creates a schedule that represents a single iteration.
3858 // Move all instructions to the first stage from later stages.
3859 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3860 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3861 ++stage) {
3862 std::deque<SUnit *> &cycleInstrs =
3863 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3864 for (SUnit *SU : llvm::reverse(cycleInstrs))
3865 ScheduledInstrs[cycle].push_front(SU);
3866 }
3867 }
3868
3869 // Erase all the elements in the later stages. Only one iteration should
3870 // remain in the scheduled list, and it contains all the instructions.
3871 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3872 ScheduledInstrs.erase(cycle);
3873
3874 // Change the registers in instruction as specified in the InstrChanges
3875 // map. We need to use the new registers to create the correct order.
3876 for (const SUnit &SU : SSD->SUnits)
3877 SSD->applyInstrChange(SU.getInstr(), *this);
3878
3879 // Reorder the instructions in each cycle to fix and improve the
3880 // generated code.
3881 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3882 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3883 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3884 SSD->fixupRegisterOverlaps(cycleInstrs);
3885 }
3886
3887 LLVM_DEBUG(dump(););
3888}
3889
3891 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3892 << " depth " << MaxDepth << " col " << Colocate << "\n";
3893 for (const auto &I : Nodes)
3894 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3895 os << "\n";
3896}
3897
3898#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3899/// Print the schedule information to the given output.
3901 // Iterate over each cycle.
3902 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3903 // Iterate over each instruction in the cycle.
3904 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3905 for (SUnit *CI : cycleInstrs->second) {
3906 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3907 os << "(" << CI->NodeNum << ") ";
3908 CI->getInstr()->print(os);
3909 os << "\n";
3910 }
3911 }
3912}
3913
3914/// Utility function used for debugging to print the schedule.
3917
3918void ResourceManager::dumpMRT() const {
3919 LLVM_DEBUG({
3920 if (UseDFA)
3921 return;
3922 std::stringstream SS;
3923 SS << "MRT:\n";
3924 SS << std::setw(4) << "Slot";
3925 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3926 SS << std::setw(3) << I;
3927 SS << std::setw(7) << "#Mops"
3928 << "\n";
3929 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3930 SS << std::setw(4) << Slot;
3931 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3932 SS << std::setw(3) << MRT[Slot][I];
3933 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3934 }
3935 dbgs() << SS.str();
3936 });
3937}
3938#endif
3939
3941 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3942 unsigned ProcResourceID = 0;
3943
3944 // We currently limit the resource kinds to 64 and below so that we can use
3945 // uint64_t for Masks
3946 assert(SM.getNumProcResourceKinds() < 64 &&
3947 "Too many kinds of resources, unsupported");
3948 // Create a unique bitmask for every processor resource unit.
3949 // Skip resource at index 0, since it always references 'InvalidUnit'.
3950 Masks.resize(SM.getNumProcResourceKinds());
3951 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3952 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3953 if (Desc.SubUnitsIdxBegin)
3954 continue;
3955 Masks[I] = 1ULL << ProcResourceID;
3956 ProcResourceID++;
3957 }
3958 // Create a unique bitmask for every processor resource group.
3959 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3960 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3961 if (!Desc.SubUnitsIdxBegin)
3962 continue;
3963 Masks[I] = 1ULL << ProcResourceID;
3964 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3965 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3966 ProcResourceID++;
3967 }
3968 LLVM_DEBUG({
3969 if (SwpShowResMask) {
3970 dbgs() << "ProcResourceDesc:\n";
3971 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3972 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3973 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3974 ProcResource->Name, I, Masks[I],
3975 ProcResource->NumUnits);
3976 }
3977 dbgs() << " -----------------\n";
3978 }
3979 });
3980}
3981
3983 LLVM_DEBUG({
3984 if (SwpDebugResource)
3985 dbgs() << "canReserveResources:\n";
3986 });
3987 if (UseDFA)
3988 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3989 ->canReserveResources(&SU.getInstr()->getDesc());
3990
3991 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3992 if (!SCDesc->isValid()) {
3993 LLVM_DEBUG({
3994 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3995 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3996 });
3997 return true;
3998 }
3999
4000 reserveResources(SCDesc, Cycle);
4001 bool Result = !isOverbooked();
4002 unreserveResources(SCDesc, Cycle);
4003
4004 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
4005 return Result;
4006}
4007
4008void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
4009 LLVM_DEBUG({
4010 if (SwpDebugResource)
4011 dbgs() << "reserveResources:\n";
4012 });
4013 if (UseDFA)
4014 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4015 ->reserveResources(&SU.getInstr()->getDesc());
4016
4017 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4018 if (!SCDesc->isValid()) {
4019 LLVM_DEBUG({
4020 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4021 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4022 });
4023 return;
4024 }
4025
4026 reserveResources(SCDesc, Cycle);
4027
4028 LLVM_DEBUG({
4029 if (SwpDebugResource) {
4030 dumpMRT();
4031 dbgs() << "reserveResources: done!\n\n";
4032 }
4033 });
4034}
4035
4036void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
4037 int Cycle) {
4038 assert(!UseDFA);
4039 for (const MCWriteProcResEntry &PRE : make_range(
4040 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4041 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4042 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4043
4044 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4045 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4046}
4047
4048void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4049 int Cycle) {
4050 assert(!UseDFA);
4051 for (const MCWriteProcResEntry &PRE : make_range(
4052 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4053 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4054 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4055
4056 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4057 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4058}
4059
4060bool ResourceManager::isOverbooked() const {
4061 assert(!UseDFA);
4062 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4063 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4064 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4065 if (MRT[Slot][I] > Desc->NumUnits)
4066 return true;
4067 }
4068 if (NumScheduledMops[Slot] > IssueWidth)
4069 return true;
4070 }
4071 return false;
4072}
4073
4074int ResourceManager::calculateResMIIDFA() const {
4075 assert(UseDFA);
4076
4077 // Sort the instructions by the number of available choices for scheduling,
4078 // least to most. Use the number of critical resources as the tie breaker.
4079 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4080 for (SUnit &SU : DAG->SUnits)
4081 FUS.calcCriticalResources(*SU.getInstr());
4082 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4083 FuncUnitOrder(FUS);
4084
4085 for (SUnit &SU : DAG->SUnits)
4086 FuncUnitOrder.push(SU.getInstr());
4087
4089 Resources.push_back(
4090 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4091
4092 while (!FuncUnitOrder.empty()) {
4093 MachineInstr *MI = FuncUnitOrder.top();
4094 FuncUnitOrder.pop();
4095 if (TII->isZeroCost(MI->getOpcode()))
4096 continue;
4097
4098 // Attempt to reserve the instruction in an existing DFA. At least one
4099 // DFA is needed for each cycle.
4100 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4101 unsigned ReservedCycles = 0;
4102 auto *RI = Resources.begin();
4103 auto *RE = Resources.end();
4104 LLVM_DEBUG({
4105 dbgs() << "Trying to reserve resource for " << NumCycles
4106 << " cycles for \n";
4107 MI->dump();
4108 });
4109 for (unsigned C = 0; C < NumCycles; ++C)
4110 while (RI != RE) {
4111 if ((*RI)->canReserveResources(*MI)) {
4112 (*RI)->reserveResources(*MI);
4113 ++ReservedCycles;
4114 break;
4115 }
4116 RI++;
4117 }
4118 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4119 << ", NumCycles:" << NumCycles << "\n");
4120 // Add new DFAs, if needed, to reserve resources.
4121 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4123 << "NewResource created to reserve resources"
4124 << "\n");
4125 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4126 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4127 NewResource->reserveResources(*MI);
4128 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4129 }
4130 }
4131
4132 int Resmii = Resources.size();
4133 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4134 return Resmii;
4135}
4136
4138 if (UseDFA)
4139 return calculateResMIIDFA();
4140
4141 // Count each resource consumption and divide it by the number of units.
4142 // ResMII is the max value among them.
4143
4144 int NumMops = 0;
4145 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4146 for (SUnit &SU : DAG->SUnits) {
4147 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4148 continue;
4149
4150 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4151 if (!SCDesc->isValid())
4152 continue;
4153
4154 LLVM_DEBUG({
4155 if (SwpDebugResource) {
4156 DAG->dumpNode(SU);
4157 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4158 << " WriteProcRes: ";
4159 }
4160 });
4161 NumMops += SCDesc->NumMicroOps;
4162 for (const MCWriteProcResEntry &PRE :
4163 make_range(STI->getWriteProcResBegin(SCDesc),
4164 STI->getWriteProcResEnd(SCDesc))) {
4165 LLVM_DEBUG({
4166 if (SwpDebugResource) {
4167 const MCProcResourceDesc *Desc =
4168 SM.getProcResource(PRE.ProcResourceIdx);
4169 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4170 }
4171 });
4172 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4173 }
4174 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4175 }
4176
4177 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4178 LLVM_DEBUG({
4179 if (SwpDebugResource)
4180 dbgs() << "#Mops: " << NumMops << ", "
4181 << "IssueWidth: " << IssueWidth << ", "
4182 << "Cycles: " << Result << "\n";
4183 });
4184
4185 LLVM_DEBUG({
4186 if (SwpDebugResource) {
4187 std::stringstream SS;
4188 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4189 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4190 << "\n";
4191 dbgs() << SS.str();
4192 }
4193 });
4194 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4195 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4196 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4197 LLVM_DEBUG({
4198 if (SwpDebugResource) {
4199 std::stringstream SS;
4200 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4201 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4202 << std::setw(10) << Cycles << "\n";
4203 dbgs() << SS.str();
4204 }
4205 });
4206 if (Cycles > Result)
4207 Result = Cycles;
4208 }
4209 return Result;
4210}
4211
4213 InitiationInterval = II;
4214 DFAResources.clear();
4215 DFAResources.resize(II);
4216 for (auto &I : DFAResources)
4217 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4218 MRT.clear();
4219 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4220 NumScheduledMops.clear();
4221 NumScheduledMops.resize(II);
4222}
4223
4224bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4225 if (Pred.isArtificial() || Dst->isBoundaryNode())
4226 return true;
4227 // Currently, dependence that is an anti-dependences but not a loop-carried is
4228 // also ignored. This behavior is preserved to prevent regression.
4229 // FIXME: Remove if this doesn't have significant impact on performance
4230 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4231}
4232
4233SwingSchedulerDDG::SwingSchedulerDDGEdges &
4234SwingSchedulerDDG::getEdges(const SUnit *SU) {
4235 if (SU == EntrySU)
4236 return EntrySUEdges;
4237 if (SU == ExitSU)
4238 return ExitSUEdges;
4239 return EdgesVec[SU->NodeNum];
4240}
4241
4242const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4243SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4244 if (SU == EntrySU)
4245 return EntrySUEdges;
4246 if (SU == ExitSU)
4247 return ExitSUEdges;
4248 return EdgesVec[SU->NodeNum];
4249}
4250
4251void SwingSchedulerDDG::addEdge(const SUnit *SU,
4252 const SwingSchedulerDDGEdge &Edge) {
4253 assert(!Edge.isValidationOnly() &&
4254 "Validation-only edges are not expected here.");
4255 auto &Edges = getEdges(SU);
4256 if (Edge.getSrc() == SU)
4257 Edges.Succs.push_back(Edge);
4258 else
4259 Edges.Preds.push_back(Edge);
4260}
4261
4262void SwingSchedulerDDG::initEdges(SUnit *SU) {
4263 for (const auto &PI : SU->Preds) {
4264 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4265 /*IsValidationOnly=*/false);
4266 addEdge(SU, Edge);
4267 }
4268
4269 for (const auto &SI : SU->Succs) {
4270 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4271 /*IsValidationOnly=*/false);
4272 addEdge(SU, Edge);
4273 }
4274}
4275
4276SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4277 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4278 : EntrySU(EntrySU), ExitSU(ExitSU) {
4279 EdgesVec.resize(SUnits.size());
4280
4281 // Add non-loop-carried edges based on the DAG.
4282 initEdges(EntrySU);
4283 initEdges(ExitSU);
4284 for (auto &SU : SUnits)
4285 initEdges(&SU);
4286
4287 // Add loop-carried edges, which are not represented in the DAG.
4288 for (SUnit &SU : SUnits) {
4289 SUnit *Src = &SU;
4290 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4291 SDep Base(Src, SDep::Barrier);
4292 Base.setLatency(1);
4293 for (SUnit *Dst : *OD) {
4294 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4295 /*IsValidationOnly=*/true);
4296 Edge.setDistance(1);
4297 ValidationOnlyEdges.push_back(Edge);
4298 }
4299 }
4300 }
4301}
4302
4303const SwingSchedulerDDG::EdgesType &
4305 return getEdges(SU).Preds;
4306}
4307
4308const SwingSchedulerDDG::EdgesType &
4310 return getEdges(SU).Succs;
4311}
4312
4313/// Check if \p Schedule doesn't violate the validation-only dependencies.
4315 unsigned II = Schedule.getInitiationInterval();
4316
4317 auto ExpandCycle = [&](SUnit *SU) {
4318 int Stage = Schedule.stageScheduled(SU);
4319 int Cycle = Schedule.cycleScheduled(SU);
4320 return Cycle + (Stage * II);
4321 };
4322
4323 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4324 SUnit *Src = Edge.getSrc();
4325 SUnit *Dst = Edge.getDst();
4326 if (!Src->isInstr() || !Dst->isInstr())
4327 continue;
4328 int CycleSrc = ExpandCycle(Src);
4329 int CycleDst = ExpandCycle(Dst);
4330 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4331 if (CycleSrc > MaxLateStart) {
4332 LLVM_DEBUG({
4333 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4334 << Dst->NodeNum << "\n";
4335 });
4336 return false;
4337 }
4338 }
4339 return true;
4340}
4341
4342void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4343 const TargetInstrInfo *TII) {
4344 for (SUnit &SU : SUnits) {
4345 SUnit *Src = &SU;
4346 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4347 SDep Dep(Src, SDep::Barrier);
4348 Dep.setLatency(1);
4349 for (SUnit *Dst : *OrderDep) {
4350 SUnit *From = Src;
4351 SUnit *To = Dst;
4352 if (From->NodeNum > To->NodeNum)
4353 std::swap(From, To);
4354
4355 // Add a forward edge if the following conditions are met:
4356 //
4357 // - The instruction of the source node (FromMI) may read memory.
4358 // - The instruction of the target node (ToMI) may modify memory, but
4359 // does not read it.
4360 // - Neither instruction is a global barrier.
4361 // - The load appears before the store in the original basic block.
4362 // - There are no barrier or store instructions between the two nodes.
4363 // - The target node is unreachable from the source node in the current
4364 // DAG.
4365 //
4366 // TODO: These conditions are inherited from a previous implementation,
4367 // and some may no longer be necessary. For now, we conservatively
4368 // retain all of them to avoid regressions, but the logic could
4369 // potentially be simplified
4370 MachineInstr *FromMI = From->getInstr();
4371 MachineInstr *ToMI = To->getInstr();
4372 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4373 !TII->isGlobalMemoryObject(FromMI) &&
4374 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4375 SDep Pred = Dep;
4376 Pred.setSUnit(From);
4377 To->addPred(Pred);
4378 }
4379 }
4380 }
4381 }
4382}
4383
4385 const MachineRegisterInfo *MRI) const {
4386 const auto *Order = getOrderDepOrNull(SU);
4387
4388 if (!Order)
4389 return;
4390
4391 const auto DumpSU = [](const SUnit *SU) {
4392 std::ostringstream OSS;
4393 OSS << "SU(" << SU->NodeNum << ")";
4394 return OSS.str();
4395 };
4396
4397 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4398 << " Order\n";
4399 for (SUnit *Dst : *Order)
4400 dbgs() << " " << DumpSU(Dst) << "\n";
4401}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:646
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:593
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:624
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
typename vector_type::const_iterator iterator
Definition SetVector.h:72
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.