LLVM 22.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and to check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
75#include "llvm/MC/LaneBitmask.h"
76#include "llvm/MC/MCInstrDesc.h"
78#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
83#include <algorithm>
84#include <cassert>
85#include <climits>
86#include <cstdint>
87#include <deque>
88#include <functional>
89#include <iomanip>
90#include <iterator>
91#include <map>
92#include <memory>
93#include <sstream>
94#include <tuple>
95#include <utility>
96#include <vector>
97
98using namespace llvm;
99
100#define DEBUG_TYPE "pipeliner"
101
102STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
103STATISTIC(NumPipelined, "Number of loops software pipelined");
104STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
105STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
106STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
107STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
108STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
109STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
110STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
111STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
112STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197/// A command line argument to limit the number of store instructions in the
198/// target basic block.
200 "pipeliner-max-num-stores",
201 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
202 cl::init(200));
203
204// A command line option to enable the CopyToPhi DAG mutation.
206 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
207 cl::init(true),
208 cl::desc("Enable CopyToPhi DAG Mutation"));
209
210/// A command line argument to force pipeliner to use specified issue
211/// width.
213 "pipeliner-force-issue-width",
214 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
215 cl::init(-1));
216
217/// A command line argument to set the window scheduling option.
220 cl::desc("Set how to use window scheduling algorithm."),
222 "Turn off window algorithm."),
224 "Use window algorithm after SMS algorithm fails."),
226 "Use window algorithm instead of SMS algorithm.")));
227
228unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
229char MachinePipeliner::ID = 0;
230#ifndef NDEBUG
232#endif
234
236 "Modulo Software Pipelining", false, false)
242 "Modulo Software Pipelining", false, false)
243
244namespace {
245
246/// This class holds an SUnit corresponding to a memory operation and other
247/// information related to the instruction.
251
252 /// The value of a memory operand.
253 const Value *MemOpValue = nullptr;
254
255 /// The offset of a memory operand.
256 int64_t MemOpOffset = 0;
257
259
260 /// True if all the underlying objects are identified.
261 bool IsAllIdentified = false;
262
264
265 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
266
267 bool isUnknown() const { return MemOpValue == nullptr; }
268
269private:
271};
272
273/// Add loop-carried chain dependencies. This class handles the same type of
274/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
275/// account dependencies across iterations.
277 // Type of instruction that is relevant to order-dependencies
278 enum class InstrTag {
279 Barrier = 0, ///< A barrier event instruction.
280 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
281 ///< not a barrier event.
282 FPExceptions = 2, ///< An instruction that does not match above, but may
282 ///< raise floating-point exceptions.
284 };
285
286 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
287 TaggedSUnit(SUnit *SU, InstrTag Tag)
288 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
289
290 InstrTag getTag() const { return InstrTag(getInt()); }
291 };
292
293 /// Holds loads and stores with memory related information.
294 struct LoadStoreChunk {
297
298 void append(SUnit *SU);
299 };
300
302 BatchAAResults *BAA;
303 std::vector<SUnit> &SUnits;
304
305 /// The size of SUnits, for convenience.
306 const unsigned N;
307
308 /// Loop-carried Edges.
309 std::vector<BitVector> LoopCarried;
310
311 /// Instructions related to chain dependencies. They are one of the
312 /// following:
313 ///
314 /// 1. Barrier event.
315 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
316 /// value.
317 /// 3. Store, but not a barrier event.
318 /// 4. None of them, but may raise floating-point exceptions.
319 ///
320 /// This is used when analyzing loop-carried dependencies that access global
321 /// barrier instructions.
322 std::vector<TaggedSUnit> TaggedSUnits;
323
324 const TargetInstrInfo *TII = nullptr;
325 const TargetRegisterInfo *TRI = nullptr;
326
327public:
329 const TargetInstrInfo *TII,
330 const TargetRegisterInfo *TRI);
331
332 /// The main function to compute loop-carried order-dependencies.
333 void computeDependencies();
334
335 const BitVector &getLoopCarried(unsigned Idx) const {
336 return LoopCarried[Idx];
337 }
338
339private:
340 /// Tags to \p SU if the instruction may affect the order-dependencies.
341 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
342
343 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
344 const LoadStoreChunk &To);
345
346 /// Add a loop-carried order dependency between \p Src and \p Dst if we
347 /// cannot prove they are independent. When \p PerformCheapCheck is true, a
348 /// lightweight dependency test (referred to as "cheap check" below) is
349 /// performed at first. Note that the cheap check is retained to maintain the
350 /// existing behavior and not expected to be used anymore.
351 ///
352 /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
353 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
354 const SUnitWithMemInfo &Dst,
355 bool PerformCheapCheck = false);
356
357 void computeDependenciesAux();
358};
359
360} // end anonymous namespace
361
362/// The "main" function for implementing Swing Modulo Scheduling.
364 if (skipFunction(mf.getFunction()))
365 return false;
366
367 if (!EnableSWP)
368 return false;
369
370 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
371 !EnableSWPOptSize.getPosition())
372 return false;
373
375 return false;
376
377 // Cannot pipeline loops without instruction itineraries if we are using
378 // DFA for the pipeliner.
379 if (mf.getSubtarget().useDFAforSMS() &&
382 return false;
383
384 MF = &mf;
388 TII = MF->getSubtarget().getInstrInfo();
389 RegClassInfo.runOnMachineFunction(*MF);
390
391 for (const auto &L : *MLI)
392 scheduleLoop(*L);
393
394 return false;
395}
396
397/// Attempt to perform the SMS algorithm on the specified loop. This function is
398/// the main entry point for the algorithm. The function identifies candidate
399/// loops, calculates the minimum initiation interval, and attempts to schedule
400/// the loop.
401bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
402 bool Changed = false;
403 for (const auto &InnerLoop : L)
404 Changed |= scheduleLoop(*InnerLoop);
405
406#ifndef NDEBUG
407 // Stop trying after reaching the limit (if any).
408 int Limit = SwpLoopLimit;
409 if (Limit >= 0) {
410 if (NumTries >= SwpLoopLimit)
411 return Changed;
412 NumTries++;
413 }
414#endif
415
416 setPragmaPipelineOptions(L);
417 if (!canPipelineLoop(L)) {
418 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
419 ORE->emit([&]() {
420 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
421 L.getStartLoc(), L.getHeader())
422 << "Failed to pipeline loop";
423 });
424
425 LI.LoopPipelinerInfo.reset();
426 return Changed;
427 }
428
429 ++NumTrytoPipeline;
430 if (useSwingModuloScheduler())
431 Changed = swingModuloScheduler(L);
432
433 if (useWindowScheduler(Changed))
434 Changed = runWindowScheduler(L);
435
436 LI.LoopPipelinerInfo.reset();
437 return Changed;
438}
439
440void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
441 // Reset the pragma for the next loop in iteration.
442 disabledByPragma = false;
443 II_setByPragma = 0;
444
445 MachineBasicBlock *LBLK = L.getTopBlock();
446
447 if (LBLK == nullptr)
448 return;
449
450 const BasicBlock *BBLK = LBLK->getBasicBlock();
451 if (BBLK == nullptr)
452 return;
453
454 const Instruction *TI = BBLK->getTerminator();
455 if (TI == nullptr)
456 return;
457
458 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
459 if (LoopID == nullptr)
460 return;
461
462 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
463 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
464
465 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
466 MDNode *MD = dyn_cast<MDNode>(MDO);
467
468 if (MD == nullptr)
469 continue;
470
471 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
472
473 if (S == nullptr)
474 continue;
475
476 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
477 assert(MD->getNumOperands() == 2 &&
478 "Pipeline initiation interval hint metadata should have two operands.");
480 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
481 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
482 } else if (S->getString() == "llvm.loop.pipeline.disable") {
483 disabledByPragma = true;
484 }
485 }
486}
487
488/// Depth-first search to detect cycles among PHI dependencies.
489/// Returns true if a cycle is detected within the PHI-only subgraph.
490static bool hasPHICycleDFS(
491 unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
492 SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {
493
494 // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
495 auto It = PhiDeps.find(Reg);
496 if (It == PhiDeps.end())
497 return false;
498
499 if (RecStack.count(Reg))
500 return true; // backedge.
501 if (Visited.count(Reg))
502 return false;
503
504 Visited.insert(Reg);
505 RecStack.insert(Reg);
506
507 for (unsigned Dep : It->second) {
508 if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
509 return true;
510 }
511
512 RecStack.erase(Reg);
513 return false;
514}
515
516static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
517 const MachineRegisterInfo &MRI) {
519
520 // Collect PHI nodes and their dependencies.
521 for (const MachineInstr &MI : LoopHeader->phis()) {
522 unsigned DefReg = MI.getOperand(0).getReg();
523 auto Ins = PhiDeps.try_emplace(DefReg).first;
524
525 // PHI operands are (Reg, MBB) pairs starting at index 1.
526 for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
527 Ins->second.push_back(MI.getOperand(I).getReg());
528 }
529
530 // DFS to detect cycles among PHI nodes.
531 SmallSet<unsigned, 8> Visited, RecStack;
532
533 // Start DFS from each PHI-def.
534 for (const auto &KV : PhiDeps) {
535 unsigned Reg = KV.first;
536 if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
537 return true;
538 }
539
540 return false;
541}
542
543/// Return true if the loop can be software pipelined. The algorithm is
544/// restricted to loops with a single basic block. Make sure that the
545/// branch in the loop can be analyzed.
546bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
547 if (L.getNumBlocks() != 1) {
548 ORE->emit([&]() {
549 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
550 L.getStartLoc(), L.getHeader())
551 << "Not a single basic block: "
552 << ore::NV("NumBlocks", L.getNumBlocks());
553 });
554 return false;
555 }
556
557 if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
558 LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
559 return false;
560 }
561
562 if (disabledByPragma) {
563 ORE->emit([&]() {
564 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
565 L.getStartLoc(), L.getHeader())
566 << "Disabled by Pragma.";
567 });
568 return false;
569 }
570
571 // Check if the branch can't be understood because we can't do pipelining
572 // if that's the case.
573 LI.TBB = nullptr;
574 LI.FBB = nullptr;
575 LI.BrCond.clear();
576 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
577 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
578 NumFailBranch++;
579 ORE->emit([&]() {
580 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
581 L.getStartLoc(), L.getHeader())
582 << "The branch can't be understood";
583 });
584 return false;
585 }
586
587 LI.LoopInductionVar = nullptr;
588 LI.LoopCompare = nullptr;
589 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
590 if (!LI.LoopPipelinerInfo) {
591 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
592 NumFailLoop++;
593 ORE->emit([&]() {
594 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
595 L.getStartLoc(), L.getHeader())
596 << "The loop structure is not supported";
597 });
598 return false;
599 }
600
601 if (!L.getLoopPreheader()) {
602 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
603 NumFailPreheader++;
604 ORE->emit([&]() {
605 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
606 L.getStartLoc(), L.getHeader())
607 << "No loop preheader found";
608 });
609 return false;
610 }
611
612 unsigned NumStores = 0;
613 for (MachineInstr &MI : *L.getHeader())
614 if (MI.mayStore())
615 ++NumStores;
616 if (NumStores > SwpMaxNumStores) {
617 LLVM_DEBUG(dbgs() << "Too many stores\n");
618 NumFailTooManyStores++;
619 ORE->emit([&]() {
620 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
621 L.getStartLoc(), L.getHeader())
622 << "Too many store instructions in the loop: "
623 << ore::NV("NumStores", NumStores) << " > "
624 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
625 });
626 return false;
627 }
628
629 // Remove any subregisters from inputs to phi nodes.
630 preprocessPhiNodes(*L.getHeader());
631 return true;
632}
633
634void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
635 MachineRegisterInfo &MRI = MF->getRegInfo();
636 SlotIndexes &Slots =
637 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
638
639 for (MachineInstr &PI : B.phis()) {
640 MachineOperand &DefOp = PI.getOperand(0);
641 assert(DefOp.getSubReg() == 0);
642 auto *RC = MRI.getRegClass(DefOp.getReg());
643
644 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
645 MachineOperand &RegOp = PI.getOperand(i);
646 if (RegOp.getSubReg() == 0)
647 continue;
648
649 // If the operand uses a subregister, replace it with a new register
650 // without subregisters, and generate a copy to the new register.
651 Register NewReg = MRI.createVirtualRegister(RC);
652 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
654 const DebugLoc &DL = PredB.findDebugLoc(At);
655 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
656 .addReg(RegOp.getReg(), getRegState(RegOp),
657 RegOp.getSubReg());
658 Slots.insertMachineInstrInMaps(*Copy);
659 RegOp.setReg(NewReg);
660 RegOp.setSubReg(0);
661 }
662 }
663}
664
665/// The SMS algorithm consists of the following main steps:
666/// 1. Computation and analysis of the dependence graph.
667/// 2. Ordering of the nodes (instructions).
668/// 3. Attempt to Schedule the loop.
669bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
670 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
671
672 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
673 SwingSchedulerDAG SMS(
675 II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
676
677 MachineBasicBlock *MBB = L.getHeader();
678 // The kernel should not include any terminator instructions. These
679 // will be added back later.
680 SMS.startBlock(MBB);
681
682 // Compute the number of 'real' instructions in the basic block by
683 // ignoring terminators.
684 unsigned size = MBB->size();
686 E = MBB->instr_end();
687 I != E; ++I, --size)
688 ;
689
690 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
691 SMS.schedule();
692 SMS.exitRegion();
693
694 SMS.finishBlock();
695 return SMS.hasNewSchedule();
696}
697
708
709bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
710 MachineSchedContext Context;
711 Context.MF = MF;
712 Context.MLI = MLI;
713 Context.MDT = MDT;
714 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
715 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
716 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
717 Context.RegClassInfo->runOnMachineFunction(*MF);
718 WindowScheduler WS(&Context, L);
719 return WS.run();
720}
721
722bool MachinePipeliner::useSwingModuloScheduler() {
723 // SwingModuloScheduler does not work when WindowScheduler is forced.
725}
726
727bool MachinePipeliner::useWindowScheduler(bool Changed) {
728 // WindowScheduler does not work for following cases:
729 // 1. when it is off.
730 // 2. when SwingModuloScheduler is successfully scheduled.
731 // 3. when pragma II is enabled.
732 if (II_setByPragma) {
733 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
734 "llvm.loop.pipeline.initiationinterval is set.\n");
735 return false;
736 }
737
740}
741
742void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
743 if (SwpForceII > 0)
744 MII = SwpForceII;
745 else if (II_setByPragma > 0)
746 MII = II_setByPragma;
747 else
748 MII = std::max(ResMII, RecMII);
749}
750
751void SwingSchedulerDAG::setMAX_II() {
752 if (SwpForceII > 0)
753 MAX_II = SwpForceII;
754 else if (II_setByPragma > 0)
755 MAX_II = II_setByPragma;
756 else
757 MAX_II = MII + SwpIISearchRange;
758}
759
760/// We override the schedule function in ScheduleDAGInstrs to implement the
761/// scheduling part of the Swing Modulo Scheduling algorithm.
763 buildSchedGraph(AA);
764 const LoopCarriedEdges LCE = addLoopCarriedDependences();
765 updatePhiDependences();
766 Topo.InitDAGTopologicalSorting();
767 changeDependences();
768 postProcessDAG();
769 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
770 LLVM_DEBUG({
771 dump();
772 dbgs() << "===== Loop Carried Edges Begin =====\n";
773 for (SUnit &SU : SUnits)
774 LCE.dump(&SU, TRI, &MRI);
775 dbgs() << "===== Loop Carried Edges End =====\n";
776 });
777
778 NodeSetType NodeSets;
779 findCircuits(NodeSets);
780 NodeSetType Circuits = NodeSets;
781
782 // Calculate the MII.
783 unsigned ResMII = calculateResMII();
784 unsigned RecMII = calculateRecMII(NodeSets);
785
786 fuseRecs(NodeSets);
787
788 // This flag is used for testing and can cause correctness problems.
789 if (SwpIgnoreRecMII)
790 RecMII = 0;
791
792 setMII(ResMII, RecMII);
793 setMAX_II();
794
795 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
796 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
797
798 // Can't schedule a loop without a valid MII.
799 if (MII == 0) {
800 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
801 NumFailZeroMII++;
802 Pass.ORE->emit([&]() {
804 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
805 << "Invalid Minimal Initiation Interval: 0";
806 });
807 return;
808 }
809
810 // Don't pipeline large loops.
811 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
812 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
813 << ", we don't pipeline large loops\n");
814 NumFailLargeMaxMII++;
815 Pass.ORE->emit([&]() {
817 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
818 << "Minimal Initiation Interval too large: "
819 << ore::NV("MII", (int)MII) << " > "
820 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
821 << "Refer to -pipeliner-max-mii.";
822 });
823 return;
824 }
825
826 computeNodeFunctions(NodeSets);
827
828 registerPressureFilter(NodeSets);
829
830 colocateNodeSets(NodeSets);
831
832 checkNodeSets(NodeSets);
833
834 LLVM_DEBUG({
835 for (auto &I : NodeSets) {
836 dbgs() << " Rec NodeSet ";
837 I.dump();
838 }
839 });
840
841 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
842
843 groupRemainingNodes(NodeSets);
844
845 removeDuplicateNodes(NodeSets);
846
847 LLVM_DEBUG({
848 for (auto &I : NodeSets) {
849 dbgs() << " NodeSet ";
850 I.dump();
851 }
852 });
853
854 computeNodeOrder(NodeSets);
855
856 // check for node order issues
857 checkValidNodeOrder(Circuits);
858
859 SMSchedule Schedule(Pass.MF, this);
860 Scheduled = schedulePipeline(Schedule);
861
862 if (!Scheduled){
863 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
864 NumFailNoSchedule++;
865 Pass.ORE->emit([&]() {
867 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
868 << "Unable to find schedule";
869 });
870 return;
871 }
872
873 unsigned numStages = Schedule.getMaxStageCount();
874 // No need to generate pipeline if there are no overlapped iterations.
875 if (numStages == 0) {
876 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
877 NumFailZeroStage++;
878 Pass.ORE->emit([&]() {
880 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
881 << "No need to pipeline - no overlapped iterations in schedule.";
882 });
883 return;
884 }
885 // Check that the maximum stage count is less than user-defined limit.
886 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
887 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
888 << " : too many stages, abort\n");
889 NumFailLargeMaxStage++;
890 Pass.ORE->emit([&]() {
892 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
893 << "Too many stages in schedule: "
894 << ore::NV("numStages", (int)numStages) << " > "
895 << ore::NV("SwpMaxStages", SwpMaxStages)
896 << ". Refer to -pipeliner-max-stages.";
897 });
898 return;
899 }
900
901 Pass.ORE->emit([&]() {
902 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
903 Loop.getHeader())
904 << "Pipelined succesfully!";
905 });
906
907 // Generate the schedule as a ModuloSchedule.
908 DenseMap<MachineInstr *, int> Cycles, Stages;
909 std::vector<MachineInstr *> OrderedInsts;
910 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
911 ++Cycle) {
912 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
913 OrderedInsts.push_back(SU->getInstr());
914 Cycles[SU->getInstr()] = Cycle;
915 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
916 }
917 }
919 for (auto &KV : NewMIs) {
920 Cycles[KV.first] = Cycles[KV.second];
921 Stages[KV.first] = Stages[KV.second];
922 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
923 }
924
925 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
926 std::move(Stages));
928 assert(NewInstrChanges.empty() &&
929 "Cannot serialize a schedule with InstrChanges!");
931 MSTI.annotate();
932 return;
933 }
934 // The experimental code generator can't work if there are InstChanges.
935 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
936 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
937 MSE.expand();
938 } else if (MVECodeGen && NewInstrChanges.empty() &&
939 LoopPipelinerInfo->isMVEExpanderSupported() &&
941 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
942 MSE.expand();
943 } else {
944 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
945 MSE.expand();
946 MSE.cleanup();
947 }
948 ++NumPipelined;
949}
950
951/// Clean up after the software pipeliner runs.
953 for (auto &KV : NewMIs)
954 MF.deleteMachineInstr(KV.second);
955 NewMIs.clear();
956
957 // Call the superclass.
959}
960
961/// Return the register values for the operands of a Phi instruction.
962/// This function assume the instruction is a Phi.
964 Register &InitVal, Register &LoopVal) {
965 assert(Phi.isPHI() && "Expecting a Phi.");
966
967 InitVal = Register();
968 LoopVal = Register();
969 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
970 if (Phi.getOperand(i + 1).getMBB() != Loop)
971 InitVal = Phi.getOperand(i).getReg();
972 else
973 LoopVal = Phi.getOperand(i).getReg();
974
975 assert(InitVal && LoopVal && "Unexpected Phi structure.");
976}
977
978/// Return the Phi register value that comes from the loop block.
980 const MachineBasicBlock *LoopBB) {
981 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
982 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
983 return Phi.getOperand(i).getReg();
984 return Register();
985}
986
987/// Return true if SUb can be reached from SUa following the chain edges.
988static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
991 Worklist.push_back(SUa);
992 while (!Worklist.empty()) {
993 const SUnit *SU = Worklist.pop_back_val();
994 for (const auto &SI : SU->Succs) {
995 SUnit *SuccSU = SI.getSUnit();
996 if (SI.getKind() == SDep::Order) {
997 if (Visited.count(SuccSU))
998 continue;
999 if (SuccSU == SUb)
1000 return true;
1001 Worklist.push_back(SuccSU);
1002 Visited.insert(SuccSU);
1003 }
1004 }
1005 }
1006 return false;
1007}
1008
1010 if (!getUnderlyingObjects())
1011 return;
1012 for (const Value *Obj : UnderlyingObjs)
1013 if (!isIdentifiedObject(Obj)) {
1014 IsAllIdentified = false;
1015 break;
1016 }
1017}
1018
1020 const SUnitWithMemInfo &Other) const {
1021 // If all underlying objects are identified objects and there is no overlap
1022 // between them, then these two instructions are disjoint.
1023 if (!IsAllIdentified || !Other.IsAllIdentified)
1024 return false;
1025 for (const Value *Obj : UnderlyingObjs)
1026 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
1027 return false;
1028 return true;
1029}
1030
1031/// Collect the underlying objects for the memory references of an instruction.
1032/// This function calls the code in ValueTracking, but first checks that the
1033/// instruction has a memory operand.
1034/// Returns false if we cannot find the underlying objects.
1035bool SUnitWithMemInfo::getUnderlyingObjects() {
1036 const MachineInstr *MI = SU->getInstr();
1037 if (!MI->hasOneMemOperand())
1038 return false;
1039 MachineMemOperand *MM = *MI->memoperands_begin();
1040 if (!MM->getValue())
1041 return false;
1042 MemOpValue = MM->getValue();
1043 MemOpOffset = MM->getOffset();
1045
1046 // TODO: A no alias scope may be valid only in a single iteration. In this
1047 // case we need to peel off it like LoopAccessAnalysis does.
1048 AATags = MM->getAAInfo();
1049 return true;
1050}
1051
/// Returns true if there is a loop-carried order dependency from \p Src to \p
/// Dst, i.e. \p Dst in a later iteration may access memory that conflicts
/// with \p Src in the current one. Conservative: returns true on "maybe".
static bool
hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
                     BatchAAResults &BAA, const TargetInstrInfo *TII,
                     const TargetRegisterInfo *TRI,
                     const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
  // Accesses to provably distinct identified objects never conflict.
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  // If Dst is already an in-iteration order-successor of Src, the normal DAG
  // edges enforce the ordering; no loop-carried edge is needed.
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();
  if (PerformCheapCheck) {
    // First, perform the cheaper check that compares the base register.
    // If they are the same and the load offset is less than the store
    // offset, then mark the dependence as loop carried potentially.
    //
    // TODO: This check will be removed.
    const MachineOperand *BaseOp1, *BaseOp2;
    int64_t Offset1, Offset2;
    bool Offset1IsScalable, Offset2IsScalable;
    if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
                                     TRI) &&
        TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
                                     TRI)) {
      // NOTE(review): the (int) casts truncate 64-bit offsets before the
      // comparison — confirm this is intended (the whole check is slated for
      // removal per the TODO above).
      if (BaseOp1->isIdenticalTo(*BaseOp2) &&
          Offset1IsScalable == Offset2IsScalable &&
          (int)Offset1 < (int)Offset2) {
        assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
               "What happened to the chain edge?");
        return true;
      }
    }
  }

  // Target hook: if the two accesses cannot overlap in a later iteration,
  // there is no loop-carried dependence.
  if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
    return false;

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  return false;
}
1116
1117void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1118 const MachineInstr *MI = SU->getInstr();
1119 if (!MI->mayLoadOrStore())
1120 return;
1121 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1122}
1123
1125 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1126 const TargetRegisterInfo *TRI)
1127 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1128 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1129
  // Traverse all instructions and extract only what we are targeting.
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  // With the interesting instructions collected, compute the actual
  // loop-carried order dependencies between them.
  computeDependenciesAux();
}
1143
1144std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1145LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1146 MachineInstr *MI = SU->getInstr();
1147 if (TII->isGlobalMemoryObject(MI))
1148 return InstrTag::Barrier;
1149
1150 if (MI->mayStore() ||
1151 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1152 return InstrTag::LoadOrStore;
1153
1154 if (MI->mayRaiseFPException())
1155 return InstrTag::FPExceptions;
1156
1157 return std::nullopt;
1158}
1159
1160void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1161 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1162 bool PerformCheapCheck) {
1163 // Avoid self-dependencies.
1164 if (Src.SU == Dst.SU)
1165 return;
1166
1167 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
1168 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1169}
1170
1171void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1172 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1173 // Add load-to-store dependencies (WAR).
1174 for (const SUnitWithMemInfo &Src : From.Loads)
1175 for (const SUnitWithMemInfo &Dst : To.Stores)
1176 // Perform a cheap check first if this is a forward dependency.
1177 addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);
1178
1179 // Add store-to-load dependencies (RAW).
1180 for (const SUnitWithMemInfo &Src : From.Stores)
1181 for (const SUnitWithMemInfo &Dst : To.Loads)
1182 addDependenciesBetweenSUs(Src, Dst);
1183
1184 // Add store-to-store dependencies (WAW).
1185 for (const SUnitWithMemInfo &Src : From.Stores)
1186 for (const SUnitWithMemInfo &Dst : To.Stores)
1187 addDependenciesBetweenSUs(Src, Dst);
1188}
1189
void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  // Partition the tagged instructions into chunks delimited by barrier
  // instructions; only operations within the same chunk can form a
  // loop-carried order dependence.
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      // A barrier ends the current chunk and opens a new one.
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, add dependencies from
  // the last barrier instruction (or load/store below it) to the first
  // barrier instruction (or load/store above it).
}
1218
1219/// Add a chain edge between a load and store if the store can be an
1220/// alias of the load on a subsequent iteration, i.e., a loop carried
1221/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1222/// but that code doesn't create loop carried dependences.
1223/// TODO: Also compute output-dependencies.
1224LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1225 LoopCarriedEdges LCE;
1226
1227 // Add loop-carried order-dependencies
1228 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1229 LCODTracker.computeDependencies();
1230 for (unsigned I = 0; I != SUnits.size(); I++)
1231 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1232 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1233
1234 LCE.modifySUnits(SUnits, TII);
1235 return LCE;
1236}
1237
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Registers of this instruction that are used by a Phi (HasPhiDef) or
    // defined by a Phi (HasPhiUse), if any; used below to spare related
    // Phis from dependence pruning.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
             UI = MRI.use_instr_begin(Reg),
             UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              // Non-Phi def feeding a Phi: the Phi reads this value on the
              // next iteration, so model it as an anti dependence.
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.

              // %3:intregs = PHI %21:intregs, %bb.6, %7:intregs, %bb.1 - SU0
              // %7:intregs = PHI %21:intregs, %bb.6, %13:intregs, %bb.1 - SU1
              // %27:intregs = A2_zxtb %3:intregs - SU2
              // %13:intregs = C2_muxri %45:predregs, 0, %46:intreg
              // If we have dependent phis, SU0 should be the successor of SU1
              // not the other way around. (it used to be SU1 is the successor
              // of SU0). In some cases, SU0 is scheduled earlier than SU1
              // resulting in bad IR as we do not have a value that can be used
              // by SU2.

              if (SU->NodeNum < I.NodeNum && !SU->isPred(&I))
                SU->addPred(SDep(&I, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            // Phi results are available at the start of the cycle, hence
            // latency 0; let the target adjust if needed.
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          // Keep order edges from Phis this instruction is genuinely
          // related to through HasPhiUse/HasPhiDef.
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    // Deletion is deferred so the Preds list is not mutated while iterating.
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}
1335
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Bail out if using the previous iteration's value would create a cycle.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
1399
/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
/// function is an approximation of SMSchedule::finalizeSchedule with all
/// non-const operations removed.
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        // push_front + reverse iteration preserves the within-cycle order of
        // each stage while placing later stages before earlier ones.
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  // Emit the flattened schedule cycle by cycle, recording each
  // instruction's stage.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
1434
1435namespace {
1436
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // Per functional unit (or proc-resource index), the number of
  // single-choice instructions that require it; see calcCriticalResources.
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first. On return, \p F holds
  // the units (or proc-resource index) of the most constrained stage.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Prefer the itinerary model when the target provides one.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Otherwise fall back to the per-operand machine scheduling model.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only single-alternative stages pin a specific unit.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2. Fewer functional-unit
  /// choices means higher priority; ties are broken by resource criticality.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1540
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets of the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it's enough
  // to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

  // Cached defs/uses per instruction, collected once in init().
  DenseMap<MachineInstr *, RegisterOperands> ROMap;

  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Debug helper: print the pressure sets Reg belongs to.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    // FIXME: The static_cast is a bug compensating bugs in the callers.
    VirtRegOrUnit VRegOrUnit =
        Reg.isVirtual() ? VirtRegOrUnit(Reg)
                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
    for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set it belongs to.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    // FIXME: The static_cast is a bug compensating bugs in the callers.
    VirtRegOrUnit VRegOrUnit =
        Reg.isVirtual() ? VirtRegOrUnit(Reg)
                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
    auto PSetIter = MRI.getPressureSets(VRegOrUnit);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set it belongs to.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is a reserved one, for example, the stack pointer
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined inside the loop body.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    DenseSet<Register> Used;
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        // FIXME: The static_cast is a bug.
        Register Reg =
            Use.VRegOrUnit.isVirtualReg()
                ? Use.VRegOrUnit.asVirtualReg()
                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  //   - by an instruction of the current iteration
  //   - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  //   - next iteration's phi instructions in i-th stage
  //   - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    //   - live-in one
    //   - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
          // FIXME: The static_cast is a bug.
          Register Reg = Use.VRegOrUnit.isVirtualReg()
                             ? Use.VRegOrUnit.asVirtualReg()
                             : Register(static_cast<unsigned>(
                                   Use.VRegOrUnit.asMCRegUnit()));
          UpdateTargetRegs(Reg);
        }
      }
    }

    // Rank uses so that a use in a later stage (and, on ties, a phi of the
    // next iteration) wins as the last use.
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    DenseMap<Register, MachineInstr *> LastUseMI;
    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        // FIXME: The static_cast is a bug.
        Register Reg =
            Use.VRegOrUnit.isVirtualReg()
                ? Use.VRegOrUnit.asVirtualReg()
                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    Instr2LastUsesTy LastUses;
    for (auto [Reg, MI] : LastUseMI)
      LastUses[MI].insert(Reg);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4..
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark a definition live in the given iteration's set and bump pressure.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   VirtRegOrUnit VRegOrUnit) {
      // FIXME: The static_cast is a bug.
      Register Reg =
          VRegOrUnit.isVirtualReg()
              ? VRegOrUnit.asVirtualReg()
              : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill a register in the given iteration's set and drop pressure.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi reads the loop-carried value produced in the previous
            // iteration.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1897
1898} // end anonymous namespace
1899
1900/// Calculate the resource constrained minimum initiation interval for the
1901/// specified loop. We use the DFA to model the resources needed for
1902/// each instruction, and we ignore dependences. A different DFA is created
1903/// for each cycle that is required. When adding a new instruction, we attempt
1904/// to add it to each existing DFA, until a legal space is found. If the
1905/// instruction cannot be reserved in an existing DFA, we create a new one.
1906unsigned SwingSchedulerDAG::calculateResMII() {
1907 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1908 ResourceManager RM(&MF.getSubtarget(), this);
1909 return RM.calculateResMII();
1910}
1911
1912/// Calculate the recurrence-constrainted minimum initiation interval.
1913/// Iterate over each circuit. Compute the delay(c) and distance(c)
1914/// for each circuit. The II needs to satisfy the inequality
1915/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1916/// II that satisfies the inequality, and the RecMII is the maximum
1917/// of those values.
1918unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1919 unsigned RecMII = 0;
1920
1921 for (NodeSet &Nodes : NodeSets) {
1922 if (Nodes.empty())
1923 continue;
1924
1925 unsigned Delay = Nodes.getLatency();
1926 unsigned Distance = 1;
1927
1928 // ii = ceil(delay / distance)
1929 unsigned CurMII = (Delay + Distance - 1) / Distance;
1930 Nodes.setRecMII(CurMII);
1931 if (CurMII > RecMII)
1932 RecMII = CurMII;
1933 }
1934
1935 return RecMII;
1936}
1937
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        // Collapse a chain of output deps: remember only the head of the
        // chain so a single back-edge (dst -> head) is added at the end.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
2001
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. This is the recursive CIRCUIT step of Johnson's circuit
/// algorithm (see findCircuits): V is the current node, S is the root of the
/// search, and HasBackedge records whether the path built so far already
/// contains an edge that goes backwards in the topological order.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Cap the amount of work: stop expanding once too many paths were seen.
    if (NumPaths > MaxPaths)
      break;
    // Only visit nodes at or after the root so each circuit is found once.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the root. Record the nodes on the stack as
      // a node-set, unless the path already went through a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit was found through V: register V in the B-set of each
    // successor so V is unblocked when that successor becomes unblocked.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
2043
2044/// Unblock a node in the circuit finding algorithm.
2045void SwingSchedulerDAG::Circuits::unblock(int U) {
2046 Blocked.reset(U);
2047 SmallPtrSet<SUnit *, 4> &BU = B[U];
2048 while (!BU.empty()) {
2049 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2050 assert(SI != BU.end() && "Invalid B set.");
2051 SUnit *W = *SI;
2052 BU.erase(W);
2053 if (Blocked.test(W->NodeNum))
2054 unblock(W->NodeNum);
2055 }
2056}
2057
2058/// Identify all the elementary circuits in the dependence graph using
2059/// Johnson's circuit algorithm.
2060void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2061 Circuits Cir(SUnits, Topo);
2062 // Create the adjacency structure.
2063 Cir.createAdjacencyStructure(this);
2064 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2065 Cir.reset();
2066 Cir.circuit(I, I, NodeSets, this);
2067 }
2068}
2069
// Create artificial dependencies between the source of a COPY/REG_SEQUENCE
// that is loop-carried to the USE in the next iteration. This helps the
// pipeliner avoid additional copies that are needed across iterations. An
// artificial dependence edge is added from the USE to the SOURCE of the
// COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPHI

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// Overall this ensures that USEOfPHI is scheduled before SRCOfCopy (since
// USE is a predecessor), which implies that the COPY/REG_SEQUENCE is
// scheduled late to avoid additional copies across iterations. The possible
// scheduling order would be
// USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
2087
2088void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
2089 for (SUnit &SU : DAG->SUnits) {
2090 // Find the COPY/REG_SEQUENCE instruction.
2091 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
2092 continue;
2093
2094 // Record the loop carried PHIs.
2096 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2098
2099 for (auto &Dep : SU.Preds) {
2100 SUnit *TmpSU = Dep.getSUnit();
2101 MachineInstr *TmpMI = TmpSU->getInstr();
2102 SDep::Kind DepKind = Dep.getKind();
2103 // Save the loop carried PHI.
2104 if (DepKind == SDep::Anti && TmpMI->isPHI())
2105 PHISUs.push_back(TmpSU);
2106 // Save the source of COPY/REG_SEQUENCE.
2107 // If the source has no pre-decessors, we will end up creating cycles.
2108 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2109 SrcSUs.push_back(TmpSU);
2110 }
2111
2112 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2113 continue;
2114
2115 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2116 // SUnit to the container.
2118 // Do not use iterator based loop here as we are updating the container.
2119 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2120 for (auto &Dep : PHISUs[Index]->Succs) {
2121 if (Dep.getKind() != SDep::Data)
2122 continue;
2123
2124 SUnit *TmpSU = Dep.getSUnit();
2125 MachineInstr *TmpMI = TmpSU->getInstr();
2126 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2127 PHISUs.push_back(TmpSU);
2128 continue;
2129 }
2130 UseSUs.push_back(TmpSU);
2131 }
2132 }
2133
2134 if (UseSUs.size() == 0)
2135 continue;
2136
2137 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2138 // Add the artificial dependencies if it does not form a cycle.
2139 for (auto *I : UseSUs) {
2140 for (auto *Src : SrcSUs) {
2141 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2142 Src->addPred(SDep(I, SDep::Artificial));
2143 SDAG->Topo.AddPred(Src, I);
2144 }
2145 }
2146 }
2147 }
2148}
2149
/// Compute several functions need to order the nodes for scheduling.
/// ASAP - Earliest time to schedule a node.
/// ALAP - Latest time to schedule a node.
/// MOV - Mobility function, difference between ALAP and ASAP.
/// D - Depth of each node.
/// H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth by walking the nodes in topological
  // order, so every predecessor is processed before its successors.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *Pred = IE.getSrc();
      if (IE.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
      if (IE.ignoreDependence(true))
        continue;
      // Loop-carried edges (distance > 0) are discounted by one II per
      // iteration of distance.
      asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
                                  IE.getDistance() * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV by walking the nodes in
  // reverse topological order.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *Succ = OE.getDst();
      if (Succ->isBoundaryNode())
        continue;
      if (OE.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
      if (OE.ignoreDependence(true))
        continue;
      alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
                                  OE.getDistance() * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
2226
2227/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2228/// as the predecessors of the elements of NodeOrder that are not also in
2229/// NodeOrder.
2232 const NodeSet *S = nullptr) {
2233 Preds.clear();
2234
2235 for (SUnit *SU : NodeOrder) {
2236 for (const auto &IE : DDG->getInEdges(SU)) {
2237 SUnit *PredSU = IE.getSrc();
2238 if (S && S->count(PredSU) == 0)
2239 continue;
2240 if (IE.ignoreDependence(true))
2241 continue;
2242 if (NodeOrder.count(PredSU) == 0)
2243 Preds.insert(PredSU);
2244 }
2245
2246 // FIXME: The following loop-carried dependencies may also need to be
2247 // considered.
2248 // - Physical register dependencies (true-dependence and WAW).
2249 // - Memory dependencies.
2250 for (const auto &OE : DDG->getOutEdges(SU)) {
2251 SUnit *SuccSU = OE.getDst();
2252 if (!OE.isAntiDep())
2253 continue;
2254 if (S && S->count(SuccSU) == 0)
2255 continue;
2256 if (NodeOrder.count(SuccSU) == 0)
2257 Preds.insert(SuccSU);
2258 }
2259 }
2260 return !Preds.empty();
2261}
2262
2263/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2264/// as the successors of the elements of NodeOrder that are not also in
2265/// NodeOrder.
2268 const NodeSet *S = nullptr) {
2269 Succs.clear();
2270
2271 for (SUnit *SU : NodeOrder) {
2272 for (const auto &OE : DDG->getOutEdges(SU)) {
2273 SUnit *SuccSU = OE.getDst();
2274 if (S && S->count(SuccSU) == 0)
2275 continue;
2276 if (OE.ignoreDependence(false))
2277 continue;
2278 if (NodeOrder.count(SuccSU) == 0)
2279 Succs.insert(SuccSU);
2280 }
2281
2282 // FIXME: The following loop-carried dependencies may also need to be
2283 // considered.
2284 // - Physical register dependnecies (true-dependnece and WAW).
2285 // - Memory dependencies.
2286 for (const auto &IE : DDG->getInEdges(SU)) {
2287 SUnit *PredSU = IE.getSrc();
2288 if (!IE.isAntiDep())
2289 continue;
2290 if (S && S->count(PredSU) == 0)
2291 continue;
2292 if (NodeOrder.count(PredSU) == 0)
2293 Succs.insert(PredSU);
2294 }
2295 }
2296 return !Succs.empty();
2297}
2298
2299/// Return true if there is a path from the specified node to any of the nodes
2300/// in DestNodes. Keep track and return the nodes in any path.
2301static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2302 SetVector<SUnit *> &DestNodes,
2303 SetVector<SUnit *> &Exclude,
2304 SmallPtrSet<SUnit *, 8> &Visited,
2305 SwingSchedulerDDG *DDG) {
2306 if (Cur->isBoundaryNode())
2307 return false;
2308 if (Exclude.contains(Cur))
2309 return false;
2310 if (DestNodes.contains(Cur))
2311 return true;
2312 if (!Visited.insert(Cur).second)
2313 return Path.contains(Cur);
2314 bool FoundPath = false;
2315 for (const auto &OE : DDG->getOutEdges(Cur))
2316 if (!OE.ignoreDependence(false))
2317 FoundPath |=
2318 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2319 for (const auto &IE : DDG->getInEdges(Cur))
2320 if (IE.isAntiDep() && IE.getDistance() == 0)
2321 FoundPath |=
2322 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2323 if (FoundPath)
2324 Path.insert(Cur);
2325 return FoundPath;
2326}
2327
2328/// Compute the live-out registers for the instructions in a node-set.
2329/// The live-out registers are those that are defined in the node-set,
2330/// but not used. Except for use operands of Phis.
2332 NodeSet &NS) {
2337 for (SUnit *SU : NS) {
2338 const MachineInstr *MI = SU->getInstr();
2339 if (MI->isPHI())
2340 continue;
2341 for (const MachineOperand &MO : MI->all_uses()) {
2342 Register Reg = MO.getReg();
2343 if (Reg.isVirtual())
2344 Uses.insert(VirtRegOrUnit(Reg));
2345 else if (MRI.isAllocatable(Reg))
2346 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2347 Uses.insert(VirtRegOrUnit(Unit));
2348 }
2349 }
2350 for (SUnit *SU : NS)
2351 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2352 if (!MO.isDead()) {
2353 Register Reg = MO.getReg();
2354 if (Reg.isVirtual()) {
2355 if (!Uses.count(VirtRegOrUnit(Reg)))
2356 LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
2358 } else if (MRI.isAllocatable(Reg)) {
2359 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2360 if (!Uses.count(VirtRegOrUnit(Unit)))
2361 LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
2363 }
2364 }
2365 RPTracker.addLiveRegs(LiveOutRegs);
2366}
2367
2368/// A heuristic to filter nodes in recurrent node-sets if the register
2369/// pressure of a set is too high.
2370void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2371 for (auto &NS : NodeSets) {
2372 // Skip small node-sets since they won't cause register pressure problems.
2373 if (NS.size() <= 2)
2374 continue;
2375 IntervalPressure RecRegPressure;
2376 RegPressureTracker RecRPTracker(RecRegPressure);
2377 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2378 computeLiveOuts(MF, RecRPTracker, NS);
2379 RecRPTracker.closeBottom();
2380
2381 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2382 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2383 return A->NodeNum > B->NodeNum;
2384 });
2385
2386 for (auto &SU : SUnits) {
2387 // Since we're computing the register pressure for a subset of the
2388 // instructions in a block, we need to set the tracker for each
2389 // instruction in the node-set. The tracker is set to the instruction
2390 // just after the one we're interested in.
2392 RecRPTracker.setPos(std::next(CurInstI));
2393
2394 RegPressureDelta RPDelta;
2395 ArrayRef<PressureChange> CriticalPSets;
2396 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2397 CriticalPSets,
2398 RecRegPressure.MaxSetPressure);
2399 if (RPDelta.Excess.isValid()) {
2400 LLVM_DEBUG(
2401 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2402 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2403 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2404 NS.setExceedPressure(SU);
2405 break;
2406 }
2407 RecRPTracker.recede();
2408 }
2409 }
2410}
2411
2412/// A heuristic to colocate node sets that have the same set of
2413/// successors.
2414void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2415 unsigned Colocate = 0;
2416 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2417 NodeSet &N1 = NodeSets[i];
2418 SmallSetVector<SUnit *, 8> S1;
2419 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2420 continue;
2421 for (int j = i + 1; j < e; ++j) {
2422 NodeSet &N2 = NodeSets[j];
2423 if (N1.compareRecMII(N2) != 0)
2424 continue;
2425 SmallSetVector<SUnit *, 8> S2;
2426 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2427 continue;
2428 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2429 N1.setColocate(++Colocate);
2430 N2.setColocate(Colocate);
2431 break;
2432 }
2433 }
2434 }
2435}
2436
2437/// Check if the existing node-sets are profitable. If not, then ignore the
2438/// recurrent node-sets, and attempt to schedule all nodes together. This is
2439/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2440/// then it's best to try to schedule all instructions together instead of
2441/// starting with the recurrent node-sets.
2442void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2443 // Look for loops with a large MII.
2444 if (MII < 17)
2445 return;
2446 // Check if the node-set contains only a simple add recurrence.
2447 for (auto &NS : NodeSets) {
2448 if (NS.getRecMII() > 2)
2449 return;
2450 if (NS.getMaxDepth() > MII)
2451 return;
2452 }
2453 NodeSets.clear();
2454 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2455}
2456
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  SmallPtrSet<SUnit *, 8> Visited;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    SmallSetVector<SUnit *, 8> N;
    // Add the nodes from the current node set to the previous node set.
    // Paths are searched from each successor of I towards the nodes that
    // were already grouped (NodesAdded).
    if (succ_L(I, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        // Visited is per-start-node; Path accumulates across starts.
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    NodesAdded.insert_range(I);
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  SmallSetVector<SUnit *, 8> N;
  if (succ_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a node
  // in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new node sets with the connected nodes of any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2520
2521/// Add the node to the set, and add all of its connected nodes to the set.
2522void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2523 SetVector<SUnit *> &NodesAdded) {
2524 NewSet.insert(SU);
2525 NodesAdded.insert(SU);
2526 for (auto &OE : DDG->getOutEdges(SU)) {
2527 SUnit *Successor = OE.getDst();
2528 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2529 NodesAdded.count(Successor) == 0)
2530 addConnectedNodes(Successor, NewSet, NodesAdded);
2531 }
2532 for (auto &IE : DDG->getInEdges(SU)) {
2533 SUnit *Predecessor = IE.getSrc();
2534 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2535 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2536 }
2537}
2538
2539/// Return true if Set1 contains elements in Set2. The elements in common
2540/// are returned in a different container.
2541static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2543 Result.clear();
2544 for (SUnit *SU : Set1) {
2545 if (Set2.count(SU) != 0)
2546 Result.insert(SU);
2547 }
2548 return !Result.empty();
2549}
2550
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Fuse J into I when both sets start at the same node.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger RecMII of the two sets on the merged set.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // Erase invalidates the end iterator, so recompute it. J now refers
        // to the next element and is intentionally not incremented.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2571
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from J any node that already appears in the earlier set I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // Erase invalidates the end iterator, so recompute it. J now refers
        // to the next element and is intentionally not incremented.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2587
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Pick the initial direction and work list based on how this node-set
    // relates to the nodes ordered so far.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << "  Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // A single node-set: seed with the nodes that have no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
    }

    // Alternate between top-down and bottom-up passes until the work list
    // is exhausted for this node-set.
    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Queue the unordered successors of the chosen node that are in
          // this node-set.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          //   - Physical register dependencies (true-dependence and WAW).
          //   - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // Restart top-down from the first node if this one exceeds the
          // register-pressure limit recorded on the node-set.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Queue the unordered predecessors of the chosen node that are in
          // this node-set.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          //   - Physical register dependencies (true-dependence and WAW).
          //   - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2753
2754/// Process the nodes in the computed order and create the pipelined schedule
2755/// of the instructions, if possible. Return true if a schedule is found.
2756bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2757
2758 if (NodeOrder.empty()){
2759 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2760 return false;
2761 }
2762
2763 bool scheduleFound = false;
2764 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2765 if (LimitRegPressure) {
2766 HRPDetector =
2767 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2768 HRPDetector->init(RegClassInfo);
2769 }
2770 // Keep increasing II until a valid schedule is found.
2771 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2772 Schedule.reset();
2773 Schedule.setInitiationInterval(II);
2774 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2775
2778 do {
2779 SUnit *SU = *NI;
2780
2781 // Compute the schedule time for the instruction, which is based
2782 // upon the scheduled time for any predecessors/successors.
2783 int EarlyStart = INT_MIN;
2784 int LateStart = INT_MAX;
2785 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2786 LLVM_DEBUG({
2787 dbgs() << "\n";
2788 dbgs() << "Inst (" << SU->NodeNum << ") ";
2789 SU->getInstr()->dump();
2790 dbgs() << "\n";
2791 });
2792 LLVM_DEBUG(
2793 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2794
2795 if (EarlyStart > LateStart)
2796 scheduleFound = false;
2797 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2798 scheduleFound =
2799 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2800 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2801 scheduleFound =
2802 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2803 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2804 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2805 // When scheduling a Phi it is better to start at the late cycle and
2806 // go backwards. The default order may insert the Phi too far away
2807 // from its first dependence.
2808 // Also, do backward search when all scheduled predecessors are
2809 // loop-carried output/order dependencies. Empirically, there are also
2810 // cases where scheduling becomes possible with backward search.
2811 if (SU->getInstr()->isPHI() ||
2812 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2813 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2814 else
2815 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2816 } else {
2817 int FirstCycle = Schedule.getFirstCycle();
2818 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2819 FirstCycle + getASAP(SU) + II - 1, II);
2820 }
2821
2822 // Even if we find a schedule, make sure the schedule doesn't exceed the
2823 // allowable number of stages. We keep trying if this happens.
2824 if (scheduleFound)
2825 if (SwpMaxStages > -1 &&
2826 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2827 scheduleFound = false;
2828
2829 LLVM_DEBUG({
2830 if (!scheduleFound)
2831 dbgs() << "\tCan't schedule\n";
2832 });
2833 } while (++NI != NE && scheduleFound);
2834
2835 // If a schedule is found, validate it against the validation-only
2836 // dependencies.
2837 if (scheduleFound)
2838 scheduleFound = DDG->isValidSchedule(Schedule);
2839
2840 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2841 if (scheduleFound)
2842 scheduleFound =
2843 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2844
2845 // If a schedule is found, check if it is a valid schedule too.
2846 if (scheduleFound)
2847 scheduleFound = Schedule.isValidSchedule(this);
2848
2849 // If a schedule was found and the option is enabled, check if the schedule
2850 // might generate additional register spills/fills.
2851 if (scheduleFound && LimitRegPressure)
2852 scheduleFound =
2853 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2854 }
2855
2856 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2857 << " (II=" << Schedule.getInitiationInterval()
2858 << ")\n");
2859
2860 if (scheduleFound) {
2861 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2862 if (!scheduleFound)
2863 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2864 }
2865
2866 if (scheduleFound) {
2867 Schedule.finalizeSchedule(this);
2868 Pass.ORE->emit([&]() {
2869 return MachineOptimizationRemarkAnalysis(
2870 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2871 << "Schedule found with Initiation Interval: "
2872 << ore::NV("II", Schedule.getInitiationInterval())
2873 << ", MaxStageCount: "
2874 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2875 });
2876 } else
2877 Schedule.reset();
2878
2879 return scheduleFound && Schedule.getMaxStageCount() > 0;
2880}
2881
2883 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2884 Register Result;
2885 for (const MachineOperand &Use : MI.all_uses()) {
2886 Register Reg = Use.getReg();
2887 if (!Reg.isVirtual())
2888 return Register();
2889 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2890 continue;
2891 if (Result)
2892 return Register();
2893 Result = Reg;
2894 }
2895 return Result;
2896}
2897
/// When Op is a value that is incremented recursively in a loop and there is a
/// unique instruction that increments it, returns true and sets Value.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return false;

  Register OrgReg = Op.getReg();
  Register CurReg = OrgReg;
  const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();

  const TargetInstrInfo *TII =
      LoopBB->getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      LoopBB->getParent()->getSubtarget().getRegisterInfo();

  // The walk must see exactly one Phi and exactly one increment instruction
  // before arriving back at Op's register.
  MachineInstr *Phi = nullptr;
  MachineInstr *Increment = nullptr;

  // Traverse definitions until it reaches Op or an instruction that does not
  // satisfy the condition.
  // Acceptable example:
  //   bb.0:
  //     %0 = PHI %3, %bb.0, ...
  //     %2 = ADD %0, Value
  //     ... = LOAD %2(Op)
  //     %3 = COPY %2
  while (true) {
    if (!CurReg.isValid() || !CurReg.isVirtual())
      return false;
    // Every definition on the cycle must live in the loop block.
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;

    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // There must be just one Phi
      if (Phi)
        return false;
      Phi = Def;
      CurReg = getLoopPhiReg(*Def, LoopBB);
    } else if (TII->getIncrementValue(*Def, Value)) {
      // Potentially a unique increment
      if (Increment)
        // Multiple increments exist
        return false;

      const MachineOperand *BaseOp;
      int64_t Offset;
      bool OffsetIsScalable;
      if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
                                       TRI)) {
        // Pre/post increment instruction
        CurReg = BaseOp->getReg();
      } else {
        // If only one of the operands is defined within the loop, it is assumed
        // to be an incremented value.
        CurReg = findUniqueOperandDefinedInLoop(*Def);
        if (!CurReg.isValid())
          return false;
      }
      Increment = Def;
    } else {
      // Any other kind of definition breaks the recursive-increment pattern.
      return false;
    }
    // Stop once the chain closes back on the original register.
    if (CurReg == OrgReg)
      break;
  }

  // The cycle must contain both a Phi and an increment to qualify.
  if (!Phi || !Increment)
    return false;

  return true;
}
2976
2977/// Return true if we can compute the amount the instruction changes
2978/// during each iteration. Set Delta to the amount of the change.
2979bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2980 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2981 const MachineOperand *BaseOp;
2982 int64_t Offset;
2983 bool OffsetIsScalable;
2984 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2985 return false;
2986
2987 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2988 if (OffsetIsScalable)
2989 return false;
2990
2991 if (!BaseOp->isReg())
2992 return false;
2993
2994 return findLoopIncrementValue(*BaseOp, Delta);
2995}
2996
2997/// Check if we can change the instruction to use an offset value from the
2998/// previous iteration. If so, return true and set the base and offset values
2999/// so that we can rewrite the load, if necessary.
3000/// v1 = Phi(v0, v3)
3001/// v2 = load v1, 0
3002/// v3 = post_store v1, 4, x
3003/// This function enables the load to be rewritten as v2 = load v3, 4.
3004bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3005 unsigned &BasePos,
3006 unsigned &OffsetPos,
3007 Register &NewBase,
3008 int64_t &Offset) {
3009 // Get the load instruction.
3010 if (TII->isPostIncrement(*MI))
3011 return false;
3012 unsigned BasePosLd, OffsetPosLd;
3013 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3014 return false;
3015 Register BaseReg = MI->getOperand(BasePosLd).getReg();
3016
3017 // Look for the Phi instruction.
3018 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3019 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3020 if (!Phi || !Phi->isPHI())
3021 return false;
3022 // Get the register defined in the loop block.
3023 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3024 if (!PrevReg)
3025 return false;
3026
3027 // Check for the post-increment load/store instruction.
3028 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3029 if (!PrevDef || PrevDef == MI)
3030 return false;
3031
3032 if (!TII->isPostIncrement(*PrevDef))
3033 return false;
3034
3035 unsigned BasePos1 = 0, OffsetPos1 = 0;
3036 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3037 return false;
3038
3039 // Make sure that the instructions do not access the same memory location in
3040 // the next iteration.
3041 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3042 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3043 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3044 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3045 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3046 MF.deleteMachineInstr(NewMI);
3047 if (!Disjoint)
3048 return false;
3049
3050 // Set the return value once we determine that we return true.
3051 BasePos = BasePosLd;
3052 OffsetPos = OffsetPosLd;
3053 NewBase = PrevReg;
3054 Offset = StoreOffset;
3055 return true;
3056}
3057
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
    SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // Recorded change: the alternative base register and the per-stage offset
    // adjustment for this instruction.
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    // Find the in-loop instruction that defines the base register, and
    // compare the stages/cycles at which the two were scheduled.
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    // Only rewrite when the base value is defined in a later stage than this
    // instruction; clone MI rather than mutating it in place.
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // Switch to the alternative base register; one stage of offset
        // adjustment is covered by the register change itself.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Compensate the immediate offset by the per-stage increment.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Install the clone and remember the old->new mapping for cleanup.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
3093
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Stop if this Phi has been seen before, to avoid cycling through a
    // chain of Phis forever.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs; follow the incoming value
    // from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3111
3112/// Return false if there is no overlap between the region accessed by BaseMI in
3113/// an iteration and the region accessed by OtherMI in subsequent iterations.
3115 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
3116 int DeltaB, DeltaO, Delta;
3117 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
3118 DeltaB != DeltaO)
3119 return true;
3120 Delta = DeltaB;
3121
3122 const MachineOperand *BaseOpB, *BaseOpO;
3123 int64_t OffsetB, OffsetO;
3124 bool OffsetBIsScalable, OffsetOIsScalable;
3125 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3126 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
3127 OffsetBIsScalable, TRI) ||
3128 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
3129 OffsetOIsScalable, TRI))
3130 return true;
3131
3132 if (OffsetBIsScalable || OffsetOIsScalable)
3133 return true;
3134
3135 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
3136 // Pass cases with different base operands but same initial values.
3137 // Typically for when pre/post increment is used.
3138
3139 if (!BaseOpB->isReg() || !BaseOpO->isReg())
3140 return true;
3141 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3142 if (!RegB.isVirtual() || !RegO.isVirtual())
3143 return true;
3144
3145 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
3146 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
3147 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
3148 return true;
3149
3150 Register InitValB;
3151 Register LoopValB;
3152 Register InitValO;
3153 Register LoopValO;
3154 getPhiRegs(*DefB, BB, InitValB, LoopValB);
3155 getPhiRegs(*DefO, BB, InitValO, LoopValO);
3156 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
3157 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
3158
3159 if (!InitDefB->isIdenticalTo(*InitDefO))
3160 return true;
3161 }
3162
3163 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3164 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3165
3166 // This is the main test, which checks the offset values and the loop
3167 // increment value to determine if the accesses may be loop carried.
3168 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3169 return true;
3170
3171 LLVM_DEBUG({
3172 dbgs() << "Overlap check:\n";
3173 dbgs() << " BaseMI: ";
3174 BaseMI->dump();
3175 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3176 << ", Len: " << AccessSizeB.getValue() << "\n";
3177 dbgs() << " OtherMI: ";
3178 OtherMI->dump();
3179 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3180 << ", Len: " << AccessSizeO.getValue() << "\n";
3181 });
3182
3183 // Excessive overlap may be detected in strided patterns.
3184 // For example, the memory addresses of the store and the load in
3185 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3186 // are assumed to overlap.
3187 if (Delta < 0) {
3188 int64_t BaseMinAddr = OffsetB;
3189 int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
3190 if (BaseMinAddr > OhterNextIterMaxAddr) {
3191 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3192 return false;
3193 }
3194 } else {
3195 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3196 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3197 if (BaseMaxAddr < OtherNextIterMinAddr) {
3198 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3199 return false;
3200 }
3201 }
3202 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3203 return true;
3204}
3205
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order/output dependences can be loop carried here; artificial edges
  // and edges into the boundary node never are.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only pairs of memory operations can carry a dependence across iterations.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3239
3240void SwingSchedulerDAG::postProcessDAG() {
3241 for (auto &M : Mutations)
3242 M->apply(this);
3243}
3244
3245/// Try to schedule the node at the specified StartCycle and continue
3246/// until the node is schedule or the EndCycle is reached. This function
3247/// returns true if the node is scheduled. This routine may search either
3248/// forward or backward for a place to insert the instruction based upon
3249/// the relative values of StartCycle and EndCycle.
3250bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3251 bool forward = true;
3252 LLVM_DEBUG({
3253 dbgs() << "Trying to insert node between " << StartCycle << " and "
3254 << EndCycle << " II: " << II << "\n";
3255 });
3256 if (StartCycle > EndCycle)
3257 forward = false;
3258
3259 // The terminating condition depends on the direction.
3260 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3261 for (int curCycle = StartCycle; curCycle != termCycle;
3262 forward ? ++curCycle : --curCycle) {
3263
3264 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3265 ProcItinResources.canReserveResources(*SU, curCycle)) {
3266 LLVM_DEBUG({
3267 dbgs() << "\tinsert at cycle " << curCycle << " ";
3268 SU->getInstr()->dump();
3269 });
3270
3271 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3272 ProcItinResources.reserveResources(*SU, curCycle);
3273 ScheduledInstrs[curCycle].push_back(SU);
3274 InstrToCycle.insert(std::make_pair(SU, curCycle));
3275 if (curCycle > LastCycle)
3276 LastCycle = curCycle;
3277 if (curCycle < FirstCycle)
3278 FirstCycle = curCycle;
3279 return true;
3280 }
3281 LLVM_DEBUG({
3282 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3283 SU->getInstr()->dump();
3284 });
3285 }
3286 return false;
3287}
3288
// Return the cycle of the earliest scheduled instruction in the chain.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    // Visit each source node at most once.
    if (Visited.count(PrevSU))
      continue;
    // Nodes not yet placed in the schedule do not constrain the result.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    // Continue walking backward along order/output dependences only.
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3312
// Return the cycle of the latest scheduled instruction in the chain.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    // Visit each destination node at most once; boundary nodes are ignored.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    // Nodes not yet placed in the schedule do not constrain the result.
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    // Continue walking forward along order/output dependences only.
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3336
/// If an instruction has a use that spans multiple iterations, then return
/// the Phi starting that chain; otherwise return nullptr. These instructions
/// are characterized by having a back-edge to a Phi, which contains a
/// reference to another Phi.
  for (auto &P : SU->Preds)
    // An anti dependence on a Phi identifies a back-edge...
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        // ...and that Phi feeding another Phi means the value lives across
        // more than one iteration.
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3348
3349/// Compute the scheduling start slot for the instruction. The start slot
3350/// depends on any predecessor or successor nodes scheduled already.
3351void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3352 int II, SwingSchedulerDAG *DAG) {
3353 const SwingSchedulerDDG *DDG = DAG->getDDG();
3354
3355 // Iterate over each instruction that has been scheduled already. The start
3356 // slot computation depends on whether the previously scheduled instruction
3357 // is a predecessor or successor of the specified instruction.
3358 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3359 for (SUnit *I : getInstructions(cycle)) {
3360 for (const auto &IE : DDG->getInEdges(SU)) {
3361 if (IE.getSrc() == I) {
3362 // FIXME: Add reverse edge to `DDG` instead of calling
3363 // `isLoopCarriedDep`
3364 if (DAG->isLoopCarriedDep(IE)) {
3365 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3366 *MinLateStart = std::min(*MinLateStart, End);
3367 }
3368 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3369 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3370 }
3371 }
3372
3373 for (const auto &OE : DDG->getOutEdges(SU)) {
3374 if (OE.getDst() == I) {
3375 // FIXME: Add reverse edge to `DDG` instead of calling
3376 // `isLoopCarriedDep`
3377 if (DAG->isLoopCarriedDep(OE)) {
3378 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3379 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3380 }
3381 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3382 *MinLateStart = std::min(*MinLateStart, LateStart);
3383 }
3384 }
3385
3386 SUnit *BE = multipleIterations(I, DAG);
3387 for (const auto &Dep : SU->Preds) {
3388 // For instruction that requires multiple iterations, make sure that
3389 // the dependent instruction is not scheduled past the definition.
3390 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3391 !SU->isPred(I))
3392 *MinLateStart = std::min(*MinLateStart, cycle);
3393 }
3394 }
3395 }
3396}
3397
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is pushed to the front of the list when
/// a use of it is already present, and appended to the end otherwise; in the
/// mixed use-and-def case the affected instructions are re-ordered
/// recursively.
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Flags accumulated while scanning the existing list:
  //   OrderBeforeUse  - SU must precede some instruction that uses its value.
  //   OrderAfterDef   - SU must follow some instruction that defines a value
  //                     it reads.
  //   OrderBeforeDef  - loop-carried case; SU should precede the definition.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  // Positions (within Insts) of the instructions that triggered the flags.
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      // If this instruction's base register was rewritten, compare against
      // the replacement register instead.
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        // SU defines a value read by *I in the same or an earlier stage:
        // the definition must be placed before the use.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        // Loop-carried case: *I defines, on the next iteration, the value SU
        // uses now.
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      // The source of an incoming anti/output/order edge must stay before SU.
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    // Remove the conflicting use and def from the list and re-insert all
    // three instructions in a consistent order.
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3530
/// Return true if the scheduled Phi has a loop carried operand.
                               MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  // Find the instruction that produces the Phi's loop-carried value.
  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  // Conservatively treat an unknown or Phi-produced loop value as carried.
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // The operand is loop carried when its producer is scheduled after the Phi
  // in the cycle order, or not in a strictly later stage.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3553
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)     = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
                                       MachineInstr *Def,
                                       MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def carries the use when one of its defs is the Phi's loop operand.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3580
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): this loop returns false whenever *any* incoming-edge source
  // is already scheduled, without filtering by dependence kind — confirm this
  // matches the intent stated in the name above.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3590
/// Determine transitive dependences of unpipelineable instructions
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with every instruction the target asks us to ignore
  // for pipelining.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable node depends on must not be pipelined
    // either.
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3619
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Nodes that may be pipelined, or are already in stage 0, only contribute
    // to the recomputed last cycle.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move the instruction between the per-cycle lists if its cycle changed.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may be exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             | Inst0 <-+
    //   SU order  |         | artificial dep
    //             | Inst1 --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not be
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that are contradict to the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3688
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          // A physical-register use must be in the same stage as its def...
          if (stageScheduled(Dst) != StageDef)
            return false;
          // ...and in a strictly later cycle than its def.
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3717
3718/// A property of the node order in swing-modulo-scheduling is
3719/// that for nodes outside circuits the following holds:
3720/// none of them is scheduled after both a successor and a
3721/// predecessor.
3722/// The method below checks whether the property is met.
3723/// If not, debug information is printed and statistics information updated.
3724/// Note that we do not use an assert statement.
3725/// The reason is that although an invalid node order may prevent
3726/// the pipeliner from finding a pipelined schedule for arbitrary II,
3727/// it does not lead to the generation of incorrect code.
3728void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3729
3730 // a sorted vector that maps each SUnit to its index in the NodeOrder
3731 typedef std::pair<SUnit *, unsigned> UnitIndex;
3732 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3733
3734 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3735 Indices.push_back(std::make_pair(NodeOrder[i], i));
3736
3737 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3738 return std::get<0>(i1) < std::get<0>(i2);
3739 };
3740
3741 // sort, so that we can perform a binary search
3742 llvm::sort(Indices, CompareKey);
3743
3744 bool Valid = true;
3745 (void)Valid;
3746 // for each SUnit in the NodeOrder, check whether
3747 // it appears after both a successor and a predecessor
3748 // of the SUnit. If this is the case, and the SUnit
3749 // is not part of circuit, then the NodeOrder is not
3750 // valid.
3751 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3752 SUnit *SU = NodeOrder[i];
3753 unsigned Index = i;
3754
3755 bool PredBefore = false;
3756 bool SuccBefore = false;
3757
3758 SUnit *Succ;
3759 SUnit *Pred;
3760 (void)Succ;
3761 (void)Pred;
3762
3763 for (const auto &IE : DDG->getInEdges(SU)) {
3764 SUnit *PredSU = IE.getSrc();
3765 unsigned PredIndex = std::get<1>(
3766 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3767 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3768 PredBefore = true;
3769 Pred = PredSU;
3770 break;
3771 }
3772 }
3773
3774 for (const auto &OE : DDG->getOutEdges(SU)) {
3775 SUnit *SuccSU = OE.getDst();
3776 // Do not process a boundary node, it was not included in NodeOrder,
3777 // hence not in Indices either, call to std::lower_bound() below will
3778 // return Indices.end().
3779 if (SuccSU->isBoundaryNode())
3780 continue;
3781 unsigned SuccIndex = std::get<1>(
3782 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3783 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3784 SuccBefore = true;
3785 Succ = SuccSU;
3786 break;
3787 }
3788 }
3789
3790 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3791 // instructions in circuits are allowed to be scheduled
3792 // after both a successor and predecessor.
3793 bool InCircuit = llvm::any_of(
3794 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3795 if (InCircuit)
3796 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3797 else {
3798 Valid = false;
3799 NumNodeOrderIssues++;
3800 LLVM_DEBUG(dbgs() << "Predecessor ");
3801 }
3802 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3803 << " are scheduled before node " << SU->NodeNum
3804 << "\n");
3805 }
3806 }
3807
3808 LLVM_DEBUG({
3809 if (!Valid)
3810 dbgs() << "Invalid node order found!\n";
3811 });
3812}
3813
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // State carried across instructions: the old (tied-use) base register and
  // its replacement, captured from the most recent p' = op(p) instruction.
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            // Install the rewritten clone and remember the old->new mapping.
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // Each captured pair is used at most once.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3865
3866std::deque<SUnit *>
3868 const std::deque<SUnit *> &Instrs) const {
3869 std::deque<SUnit *> NewOrderPhi;
3870 for (SUnit *SU : Instrs) {
3871 if (SU->getInstr()->isPHI())
3872 NewOrderPhi.push_back(SU);
3873 }
3874 std::deque<SUnit *> NewOrderI;
3875 for (SUnit *SU : Instrs) {
3876 if (!SU->getInstr()->isPHI())
3877 orderDependence(SSD, SU, NewOrderI);
3878 }
3879 llvm::append_range(NewOrderPhi, NewOrderI);
3880 return NewOrderPhi;
3881}
3882
3883/// After the schedule has been formed, call this function to combine
3884/// the instructions from the different stages/cycles. That is, this
3885/// function creates a schedule that represents a single iteration.
3887 // Move all instructions to the first stage from later stages.
3888 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3889 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3890 ++stage) {
3891 std::deque<SUnit *> &cycleInstrs =
3892 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3893 for (SUnit *SU : llvm::reverse(cycleInstrs))
3894 ScheduledInstrs[cycle].push_front(SU);
3895 }
3896 }
3897
3898 // Erase all the elements in the later stages. Only one iteration should
3899 // remain in the scheduled list, and it contains all the instructions.
3900 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3901 ScheduledInstrs.erase(cycle);
3902
3903 // Change the registers in instruction as specified in the InstrChanges
3904 // map. We need to use the new registers to create the correct order.
3905 for (const SUnit &SU : SSD->SUnits)
3906 SSD->applyInstrChange(SU.getInstr(), *this);
3907
3908 // Reorder the instructions in each cycle to fix and improve the
3909 // generated code.
3910 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3911 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3912 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3913 SSD->fixupRegisterOverlaps(cycleInstrs);
3914 }
3915
3916 LLVM_DEBUG(dump(););
3917}
3918
3920 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3921 << " depth " << MaxDepth << " col " << Colocate << "\n";
3922 for (const auto &I : Nodes)
3923 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3924 os << "\n";
3925}
3926
3927#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3928/// Print the schedule information to the given output.
3930 // Iterate over each cycle.
3931 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3932 // Iterate over each instruction in the cycle.
3933 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3934 for (SUnit *CI : cycleInstrs->second) {
3935 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3936 os << "(" << CI->NodeNum << ") ";
3937 CI->getInstr()->print(os);
3938 os << "\n";
3939 }
3940 }
3941}
3942
3943/// Utility function used for debugging to print the schedule.
3946
3947void ResourceManager::dumpMRT() const {
3948 LLVM_DEBUG({
3949 if (UseDFA)
3950 return;
3951 std::stringstream SS;
3952 SS << "MRT:\n";
3953 SS << std::setw(4) << "Slot";
3954 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3955 SS << std::setw(3) << I;
3956 SS << std::setw(7) << "#Mops"
3957 << "\n";
3958 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3959 SS << std::setw(4) << Slot;
3960 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3961 SS << std::setw(3) << MRT[Slot][I];
3962 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3963 }
3964 dbgs() << SS.str();
3965 });
3966}
3967#endif
3968
3970 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3971 unsigned ProcResourceID = 0;
3972
3973 // We currently limit the resource kinds to 64 and below so that we can use
3974 // uint64_t for Masks
3975 assert(SM.getNumProcResourceKinds() < 64 &&
3976 "Too many kinds of resources, unsupported");
3977 // Create a unique bitmask for every processor resource unit.
3978 // Skip resource at index 0, since it always references 'InvalidUnit'.
3979 Masks.resize(SM.getNumProcResourceKinds());
3980 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3981 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3982 if (Desc.SubUnitsIdxBegin)
3983 continue;
3984 Masks[I] = 1ULL << ProcResourceID;
3985 ProcResourceID++;
3986 }
3987 // Create a unique bitmask for every processor resource group.
3988 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3989 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3990 if (!Desc.SubUnitsIdxBegin)
3991 continue;
3992 Masks[I] = 1ULL << ProcResourceID;
3993 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3994 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3995 ProcResourceID++;
3996 }
3997 LLVM_DEBUG({
3998 if (SwpShowResMask) {
3999 dbgs() << "ProcResourceDesc:\n";
4000 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4001 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
4002 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
4003 ProcResource->Name, I, Masks[I],
4004 ProcResource->NumUnits);
4005 }
4006 dbgs() << " -----------------\n";
4007 }
4008 });
4009}
4010
4012 LLVM_DEBUG({
4013 if (SwpDebugResource)
4014 dbgs() << "canReserveResources:\n";
4015 });
4016 if (UseDFA)
4017 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4018 ->canReserveResources(&SU.getInstr()->getDesc());
4019
4020 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4021 if (!SCDesc->isValid()) {
4022 LLVM_DEBUG({
4023 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4024 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4025 });
4026 return true;
4027 }
4028
4029 reserveResources(SCDesc, Cycle);
4030 bool Result = !isOverbooked();
4031 unreserveResources(SCDesc, Cycle);
4032
4033 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
4034 return Result;
4035}
4036
4037void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
4038 LLVM_DEBUG({
4039 if (SwpDebugResource)
4040 dbgs() << "reserveResources:\n";
4041 });
4042 if (UseDFA)
4043 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4044 ->reserveResources(&SU.getInstr()->getDesc());
4045
4046 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4047 if (!SCDesc->isValid()) {
4048 LLVM_DEBUG({
4049 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4050 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4051 });
4052 return;
4053 }
4054
4055 reserveResources(SCDesc, Cycle);
4056
4057 LLVM_DEBUG({
4058 if (SwpDebugResource) {
4059 dumpMRT();
4060 dbgs() << "reserveResources: done!\n\n";
4061 }
4062 });
4063}
4064
4065void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
4066 int Cycle) {
4067 assert(!UseDFA);
4068 for (const MCWriteProcResEntry &PRE : make_range(
4069 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4070 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4071 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4072
4073 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4074 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4075}
4076
4077void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4078 int Cycle) {
4079 assert(!UseDFA);
4080 for (const MCWriteProcResEntry &PRE : make_range(
4081 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4082 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4083 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4084
4085 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4086 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4087}
4088
4089bool ResourceManager::isOverbooked() const {
4090 assert(!UseDFA);
4091 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4092 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4093 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4094 if (MRT[Slot][I] > Desc->NumUnits)
4095 return true;
4096 }
4097 if (NumScheduledMops[Slot] > IssueWidth)
4098 return true;
4099 }
4100 return false;
4101}
4102
4103int ResourceManager::calculateResMIIDFA() const {
4104 assert(UseDFA);
4105
4106 // Sort the instructions by the number of available choices for scheduling,
4107 // least to most. Use the number of critical resources as the tie breaker.
4108 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4109 for (SUnit &SU : DAG->SUnits)
4110 FUS.calcCriticalResources(*SU.getInstr());
4111 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4112 FuncUnitOrder(FUS);
4113
4114 for (SUnit &SU : DAG->SUnits)
4115 FuncUnitOrder.push(SU.getInstr());
4116
4118 Resources.push_back(
4119 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4120
4121 while (!FuncUnitOrder.empty()) {
4122 MachineInstr *MI = FuncUnitOrder.top();
4123 FuncUnitOrder.pop();
4124 if (TII->isZeroCost(MI->getOpcode()))
4125 continue;
4126
4127 // Attempt to reserve the instruction in an existing DFA. At least one
4128 // DFA is needed for each cycle.
4129 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4130 unsigned ReservedCycles = 0;
4131 auto *RI = Resources.begin();
4132 auto *RE = Resources.end();
4133 LLVM_DEBUG({
4134 dbgs() << "Trying to reserve resource for " << NumCycles
4135 << " cycles for \n";
4136 MI->dump();
4137 });
4138 for (unsigned C = 0; C < NumCycles; ++C)
4139 while (RI != RE) {
4140 if ((*RI)->canReserveResources(*MI)) {
4141 (*RI)->reserveResources(*MI);
4142 ++ReservedCycles;
4143 break;
4144 }
4145 RI++;
4146 }
4147 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4148 << ", NumCycles:" << NumCycles << "\n");
4149 // Add new DFAs, if needed, to reserve resources.
4150 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4152 << "NewResource created to reserve resources"
4153 << "\n");
4154 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4155 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4156 NewResource->reserveResources(*MI);
4157 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4158 }
4159 }
4160
4161 int Resmii = Resources.size();
4162 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4163 return Resmii;
4164}
4165
4167 if (UseDFA)
4168 return calculateResMIIDFA();
4169
4170 // Count each resource consumption and divide it by the number of units.
4171 // ResMII is the max value among them.
4172
4173 int NumMops = 0;
4174 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4175 for (SUnit &SU : DAG->SUnits) {
4176 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4177 continue;
4178
4179 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4180 if (!SCDesc->isValid())
4181 continue;
4182
4183 LLVM_DEBUG({
4184 if (SwpDebugResource) {
4185 DAG->dumpNode(SU);
4186 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4187 << " WriteProcRes: ";
4188 }
4189 });
4190 NumMops += SCDesc->NumMicroOps;
4191 for (const MCWriteProcResEntry &PRE :
4192 make_range(STI->getWriteProcResBegin(SCDesc),
4193 STI->getWriteProcResEnd(SCDesc))) {
4194 LLVM_DEBUG({
4195 if (SwpDebugResource) {
4196 const MCProcResourceDesc *Desc =
4197 SM.getProcResource(PRE.ProcResourceIdx);
4198 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4199 }
4200 });
4201 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4202 }
4203 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4204 }
4205
4206 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4207 LLVM_DEBUG({
4208 if (SwpDebugResource)
4209 dbgs() << "#Mops: " << NumMops << ", "
4210 << "IssueWidth: " << IssueWidth << ", "
4211 << "Cycles: " << Result << "\n";
4212 });
4213
4214 LLVM_DEBUG({
4215 if (SwpDebugResource) {
4216 std::stringstream SS;
4217 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4218 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4219 << "\n";
4220 dbgs() << SS.str();
4221 }
4222 });
4223 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4224 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4225 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4226 LLVM_DEBUG({
4227 if (SwpDebugResource) {
4228 std::stringstream SS;
4229 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4230 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4231 << std::setw(10) << Cycles << "\n";
4232 dbgs() << SS.str();
4233 }
4234 });
4235 if (Cycles > Result)
4236 Result = Cycles;
4237 }
4238 return Result;
4239}
4240
4242 InitiationInterval = II;
4243 DFAResources.clear();
4244 DFAResources.resize(II);
4245 for (auto &I : DFAResources)
4246 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4247 MRT.clear();
4248 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4249 NumScheduledMops.clear();
4250 NumScheduledMops.resize(II);
4251}
4252
4253bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4254 if (Pred.isArtificial() || Dst->isBoundaryNode())
4255 return true;
4256 // Currently, dependence that is an anti-dependences but not a loop-carried is
4257 // also ignored. This behavior is preserved to prevent regression.
4258 // FIXME: Remove if this doesn't have significant impact on performance
4259 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4260}
4261
4262SwingSchedulerDDG::SwingSchedulerDDGEdges &
4263SwingSchedulerDDG::getEdges(const SUnit *SU) {
4264 if (SU == EntrySU)
4265 return EntrySUEdges;
4266 if (SU == ExitSU)
4267 return ExitSUEdges;
4268 return EdgesVec[SU->NodeNum];
4269}
4270
4271const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4272SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4273 if (SU == EntrySU)
4274 return EntrySUEdges;
4275 if (SU == ExitSU)
4276 return ExitSUEdges;
4277 return EdgesVec[SU->NodeNum];
4278}
4279
4280void SwingSchedulerDDG::addEdge(const SUnit *SU,
4281 const SwingSchedulerDDGEdge &Edge) {
4282 assert(!Edge.isValidationOnly() &&
4283 "Validation-only edges are not expected here.");
4284 auto &Edges = getEdges(SU);
4285 if (Edge.getSrc() == SU)
4286 Edges.Succs.push_back(Edge);
4287 else
4288 Edges.Preds.push_back(Edge);
4289}
4290
4291void SwingSchedulerDDG::initEdges(SUnit *SU) {
4292 for (const auto &PI : SU->Preds) {
4293 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4294 /*IsValidationOnly=*/false);
4295 addEdge(SU, Edge);
4296 }
4297
4298 for (const auto &SI : SU->Succs) {
4299 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4300 /*IsValidationOnly=*/false);
4301 addEdge(SU, Edge);
4302 }
4303}
4304
4305SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4306 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4307 : EntrySU(EntrySU), ExitSU(ExitSU) {
4308 EdgesVec.resize(SUnits.size());
4309
4310 // Add non-loop-carried edges based on the DAG.
4311 initEdges(EntrySU);
4312 initEdges(ExitSU);
4313 for (auto &SU : SUnits)
4314 initEdges(&SU);
4315
4316 // Add loop-carried edges, which are not represented in the DAG.
4317 for (SUnit &SU : SUnits) {
4318 SUnit *Src = &SU;
4319 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4320 SDep Base(Src, SDep::Barrier);
4321 Base.setLatency(1);
4322 for (SUnit *Dst : *OD) {
4323 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4324 /*IsValidationOnly=*/true);
4325 Edge.setDistance(1);
4326 ValidationOnlyEdges.push_back(Edge);
4327 }
4328 }
4329 }
4330}
4331
4332const SwingSchedulerDDG::EdgesType &
4334 return getEdges(SU).Preds;
4335}
4336
4337const SwingSchedulerDDG::EdgesType &
4339 return getEdges(SU).Succs;
4340}
4341
4342/// Check if \p Schedule doesn't violate the validation-only dependencies.
4344 unsigned II = Schedule.getInitiationInterval();
4345
4346 auto ExpandCycle = [&](SUnit *SU) {
4347 int Stage = Schedule.stageScheduled(SU);
4348 int Cycle = Schedule.cycleScheduled(SU);
4349 return Cycle + (Stage * II);
4350 };
4351
4352 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4353 SUnit *Src = Edge.getSrc();
4354 SUnit *Dst = Edge.getDst();
4355 if (!Src->isInstr() || !Dst->isInstr())
4356 continue;
4357 int CycleSrc = ExpandCycle(Src);
4358 int CycleDst = ExpandCycle(Dst);
4359 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4360 if (CycleSrc > MaxLateStart) {
4361 LLVM_DEBUG({
4362 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4363 << Dst->NodeNum << "\n";
4364 });
4365 return false;
4366 }
4367 }
4368 return true;
4369}
4370
4371void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4372 const TargetInstrInfo *TII) {
4373 for (SUnit &SU : SUnits) {
4374 SUnit *Src = &SU;
4375 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4376 SDep Dep(Src, SDep::Barrier);
4377 Dep.setLatency(1);
4378 for (SUnit *Dst : *OrderDep) {
4379 SUnit *From = Src;
4380 SUnit *To = Dst;
4381 if (From->NodeNum > To->NodeNum)
4382 std::swap(From, To);
4383
4384 // Add a forward edge if the following conditions are met:
4385 //
4386 // - The instruction of the source node (FromMI) may read memory.
4387 // - The instruction of the target node (ToMI) may modify memory, but
4388 // does not read it.
4389 // - Neither instruction is a global barrier.
4390 // - The load appears before the store in the original basic block.
4391 // - There are no barrier or store instructions between the two nodes.
4392 // - The target node is unreachable from the source node in the current
4393 // DAG.
4394 //
4395 // TODO: These conditions are inherited from a previous implementation,
4396 // and some may no longer be necessary. For now, we conservatively
4397 // retain all of them to avoid regressions, but the logic could
4398 // potentially be simplified
4399 MachineInstr *FromMI = From->getInstr();
4400 MachineInstr *ToMI = To->getInstr();
4401 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4402 !TII->isGlobalMemoryObject(FromMI) &&
4403 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4404 SDep Pred = Dep;
4405 Pred.setSUnit(From);
4406 To->addPred(Pred);
4407 }
4408 }
4409 }
4410 }
4411}
4412
4414 const MachineRegisterInfo *MRI) const {
4415 const auto *Order = getOrderDepOrNull(SU);
4416
4417 if (!Order)
4418 return;
4419
4420 const auto DumpSU = [](const SUnit *SU) {
4421 std::ostringstream OSS;
4422 OSS << "SU(" << SU->NodeNum << ")";
4423 return OSS.str();
4424 };
4425
4426 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4427 << " Order\n";
4428 for (SUnit *Dst : *Order)
4429 dbgs() << " " << DumpSU(Dst) << "\n";
4430}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
const TargetInstrInfo & TII
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file defines the DenseMap class.
#define DEBUG_TYPE
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:593
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD, bool PerformCheapCheck)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:618
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
typename vector_type::const_iterator iterator
Definition SetVector.h:72
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2070
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2140
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2006
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.