LLVM 22.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and to check
24// whether an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
75#include "llvm/MC/LaneBitmask.h"
76#include "llvm/MC/MCInstrDesc.h"
78#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
83#include <algorithm>
84#include <cassert>
85#include <climits>
86#include <cstdint>
87#include <deque>
88#include <functional>
89#include <iomanip>
90#include <iterator>
91#include <map>
92#include <memory>
93#include <sstream>
94#include <tuple>
95#include <utility>
96#include <vector>
97
98using namespace llvm;
99
100#define DEBUG_TYPE "pipeliner"
101
// Pass-wide statistics: attempt/success counters plus one counter for each
// distinct bail-out reason in canPipelineLoop() and schedule().
102STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
103STATISTIC(NumPipelined, "Number of loops software pipelined");
104STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
105STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
106STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
107STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
108STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
109STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
110STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
111STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
112STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
114
// NOTE(review): several cl::opt declarator lines are missing from this scrape
// (e.g. the `static cl::opt<...> Name(` line before the
// "pipeliner-annotate-for-testing", "pipeliner-experimental-cg",
// "pipeliner-max-num-stores", "pipeliner-force-issue-width" and window
// scheduling argument lists) — confirm the option variable names against
// upstream before editing.
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument limiting how large the computed MII may be before
124/// the pipeliner gives up (see the SwpMaxMii check in schedule()).
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
// Debug-only cap on how many loops the pass will attempt (see scheduleLoop).
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
// Testing knob: forces RecMII to 0 in schedule(); can cause miscompiles.
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197/// A command line argument to limit the number of store instructions in the
198/// target basic block.
200 "pipeliner-max-num-stores",
201 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
202 cl::init(200));
203
204// A command line option to enable the CopyToPhi DAG mutation.
206 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
207 cl::init(true),
208 cl::desc("Enable CopyToPhi DAG Mutation"));
209
210/// A command line argument to force pipeliner to use specified issue
211/// width.
213 "pipeliner-force-issue-width",
214 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
215 cl::init(-1));
216
217/// A command line argument to set the window scheduling option.
218 cl::desc("Set how to use window scheduling algorithm."),
220 cl::desc("Set how to use window scheduling algorithm."),
222 "Turn off window algorithm."),
224 "Use window algorithm after SMS algorithm fails."),
226 "Use window algorithm instead of SMS algorithm.")));
227
228unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
229char MachinePipeliner::ID = 0;
230#ifndef NDEBUG
// NOTE(review): the NumTries counter definition is on a line missing from
// this scrape (it is referenced from scheduleLoop below).
232#endif
234
// NOTE(review): the INITIALIZE_PASS_BEGIN/DEPENDENCY/END macro lines are
// partially missing from this scrape.
236 "Modulo Software Pipelining", false, false)
242 "Modulo Software Pipelining", false, false)
243
244namespace {
245
246/// This class holds an SUnit corresponding to a memory operation and other
247/// information related to the instruction.
// NOTE(review): the struct header (SUnitWithMemInfo) and its SUnit member
// declaration are on lines missing from this scrape.
251
252 /// The value of a memory operand.
253 const Value *MemOpValue = nullptr;
254
255 /// The offset of a memory operand.
256 int64_t MemOpOffset = 0;
257
 // NOTE(review): the AATags / UnderlyingObjs member declarations appear on
 // lines missing from this scrape; they are used by the methods below.
259
260 /// True if all the underlying objects are identified.
261 bool IsAllIdentified = false;
262
264
265 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
266
 /// True when no IR value could be associated with the memory operand.
267 bool isUnknown() const { return MemOpValue == nullptr; }
268
269private:
271};
272
273/// Add loop-carried chain dependencies. This class handles the same type of
274/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
275/// account dependencies across iterations.
// NOTE(review): the class header (LoopCarriedOrderDepsTracker) is on a line
// missing from this scrape.
277 // Type of instruction that is relevant to order-dependencies
278 enum class InstrTag {
279 Barrier = 0, ///< A barrier event instruction.
280 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
281 ///< not a barrier event.
282 FPExceptions = 2, ///< An instruction that does not match above, but may
283 ///< raise floating-point exceptions.
284 };
285
 /// An SUnit packed together with its InstrTag classification.
286 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
287 TaggedSUnit(SUnit *SU, InstrTag Tag)
288 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
289
290 InstrTag getTag() const { return InstrTag(getInt()); }
291 };
292
293 /// Holds loads and stores with memory related information.
294 struct LoadStoreChunk {
 // NOTE(review): the Loads/Stores member declarations are on lines missing
 // from this scrape (append() below selects between them).
297
298 void append(SUnit *SU);
299 };
300
302 BatchAAResults *BAA;
303 std::vector<SUnit> &SUnits;
304
305 /// The size of SUnits, for convenience.
306 const unsigned N;
307
308 /// Loop-carried Edges.
309 std::vector<BitVector> LoopCarried;
310
311 /// Instructions related to chain dependencies. They are one of the
312 /// following:
313 ///
314 /// 1. Barrier event.
315 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
316 /// value.
317 /// 3. Store, but not a barrier event.
318 /// 4. None of them, but may raise floating-point exceptions.
319 ///
320 /// This is used when analyzing loop-carried dependencies that access global
321 /// barrier instructions.
322 std::vector<TaggedSUnit> TaggedSUnits;
323
324 const TargetInstrInfo *TII = nullptr;
325 const TargetRegisterInfo *TRI = nullptr;
326
327public:
329 const TargetInstrInfo *TII,
330 const TargetRegisterInfo *TRI);
331
332 /// The main function to compute loop-carried order-dependencies.
333 void computeDependencies();
334
 /// Bits set in the result mark nodes with a loop-carried edge from \p Idx.
335 const BitVector &getLoopCarried(unsigned Idx) const {
336 return LoopCarried[Idx];
337 }
338
339private:
340 /// Returns a tag for \p SU if the instruction may affect the
340 /// order-dependencies, or std::nullopt when it cannot.
341 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
342
343 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
344 const LoadStoreChunk &To);
345
346 /// Add a loop-carried order dependency between \p Src and \p Dst if we
347 /// cannot prove they are independent. When \p PerformCheapCheck is true, a
348 /// lightweight dependency test (referred to as "cheap check" below) is
349 /// performed at first. Note that the cheap check is retained to maintain the
350 /// existing behavior and not expected to be used anymore.
351 ///
352 /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
353 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
354 const SUnitWithMemInfo &Dst,
355 bool PerformCheapCheck = false);
356
357 void computeDependenciesAux();
358};
359
360} // end anonymous namespace
361
362/// The "main" function for implementing Swing Modulo Scheduling.
// NOTE(review): the function signature line is missing from this scrape
// (presumably `bool MachinePipeliner::runOnMachineFunction(MachineFunction
// &mf) {`) — confirm against upstream.
364 if (skipFunction(mf.getFunction()))
365 return false;
366
367 if (!EnableSWP)
368 return false;
369
 // Skip -Os functions unless -enable-pipeliner-opt-size was given.
370 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
371 !EnableSWPOptSize.getPosition())
372 return false;
373
 // NOTE(review): the condition guarding this early return is on a line
 // missing from this scrape — verify against upstream.
375 return false;
376
377 // Cannot pipeline loops without instruction itineraries if we are using
378 // DFA for the pipeliner.
379 if (mf.getSubtarget().useDFAforSMS() &&
 // NOTE(review): the itinerary-check condition lines are missing here.
382 return false;
383
384 MF = &mf;
 // NOTE(review): initialization of the analysis members (MLI, MDT, ORE, ...)
 // is on lines missing from this scrape; MLI is used in the loop below.
388 TII = MF->getSubtarget().getInstrInfo();
389 RegClassInfo.runOnMachineFunction(*MF);
390
 // Attempt to pipeline every top-level loop (scheduleLoop recurses).
391 for (const auto &L : *MLI)
392 scheduleLoop(*L);
393
 // The pass never reports IR-level changes to the legacy pass manager here.
394 return false;
395}
396
397/// Attempt to perform the SMS algorithm on the specified loop. This function is
398/// the main entry point for the algorithm. The function identifies candidate
399/// loops, calculates the minimum initiation interval, and attempts to schedule
400/// the loop.
401bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
402 bool Changed = false;
403 for (const auto &InnerLoop : L)
404 Changed |= scheduleLoop(*InnerLoop);
405
406#ifndef NDEBUG
407 // Stop trying after reaching the limit (if any).
408 int Limit = SwpLoopLimit;
409 if (Limit >= 0) {
410 if (NumTries >= SwpLoopLimit)
411 return Changed;
412 NumTries++;
413 }
414#endif
415
416 setPragmaPipelineOptions(L);
417 if (!canPipelineLoop(L)) {
418 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
419 ORE->emit([&]() {
420 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
421 L.getStartLoc(), L.getHeader())
422 << "Failed to pipeline loop";
423 });
424
425 LI.LoopPipelinerInfo.reset();
426 return Changed;
427 }
428
429 ++NumTrytoPipeline;
430 if (useSwingModuloScheduler())
431 Changed = swingModuloScheduler(L);
432
433 if (useWindowScheduler(Changed))
434 Changed = runWindowScheduler(L);
435
436 LI.LoopPipelinerInfo.reset();
437 return Changed;
438}
439
440void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
441 // Reset the pragma for the next loop in iteration.
442 disabledByPragma = false;
443 II_setByPragma = 0;
444
445 MachineBasicBlock *LBLK = L.getTopBlock();
446
447 if (LBLK == nullptr)
448 return;
449
450 const BasicBlock *BBLK = LBLK->getBasicBlock();
451 if (BBLK == nullptr)
452 return;
453
 // Loop metadata hangs off the terminator of the IR block behind the loop.
454 const Instruction *TI = BBLK->getTerminator();
455 if (TI == nullptr)
456 return;
457
458 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
459 if (LoopID == nullptr)
460 return;
461
462 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
463 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
464
 // Operand 0 is the loop id's self-reference; the rest are hint nodes.
465 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
466 MDNode *MD = dyn_cast<MDNode>(MDO);
467
468 if (MD == nullptr)
469 continue;
470
471 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
472
473 if (S == nullptr)
474 continue;
475
476 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
477 assert(MD->getNumOperands() == 2 &&
478 "Pipeline initiation interval hint metadata should have two operands.");
 // NOTE(review): the assignment target (presumably `II_setByPragma =`) is
 // on a line missing from this scrape — confirm against upstream.
480 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
481 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.")
;
482 } else if (S->getString() == "llvm.loop.pipeline.disable") {
483 disabledByPragma = true;
484 }
485 }
486}
487
488/// Return true if the loop can be software pipelined. The algorithm is
489/// restricted to loops with a single basic block. Make sure that the
490/// branch in the loop can be analyzed.
491bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
492 if (L.getNumBlocks() != 1) {
493 ORE->emit([&]() {
494 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
495 L.getStartLoc(), L.getHeader())
496 << "Not a single basic block: "
497 << ore::NV("NumBlocks", L.getNumBlocks());
498 });
499 return false;
500 }
501
502 if (disabledByPragma) {
503 ORE->emit([&]() {
504 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
505 L.getStartLoc(), L.getHeader())
506 << "Disabled by Pragma.";
507 });
508 return false;
509 }
510
511 // Check if the branch can't be understood because we can't do pipelining
512 // if that's the case.
513 LI.TBB = nullptr;
514 LI.FBB = nullptr;
515 LI.BrCond.clear();
516 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
517 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
518 NumFailBranch++;
519 ORE->emit([&]() {
520 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
521 L.getStartLoc(), L.getHeader())
522 << "The branch can't be understood";
523 });
524 return false;
525 }
526
527 LI.LoopInductionVar = nullptr;
528 LI.LoopCompare = nullptr;
529 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
530 if (!LI.LoopPipelinerInfo) {
531 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
532 NumFailLoop++;
533 ORE->emit([&]() {
534 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
535 L.getStartLoc(), L.getHeader())
536 << "The loop structure is not supported";
537 });
538 return false;
539 }
540
541 if (!L.getLoopPreheader()) {
542 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
543 NumFailPreheader++;
544 ORE->emit([&]() {
545 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
546 L.getStartLoc(), L.getHeader())
547 << "No loop preheader found";
548 });
549 return false;
550 }
551
552 unsigned NumStores = 0;
553 for (MachineInstr &MI : *L.getHeader())
554 if (MI.mayStore())
555 ++NumStores;
556 if (NumStores > SwpMaxNumStores) {
557 LLVM_DEBUG(dbgs() << "Too many stores\n");
558 NumFailTooManyStores++;
559 ORE->emit([&]() {
560 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
561 L.getStartLoc(), L.getHeader())
562 << "Too many store instructions in the loop: "
563 << ore::NV("NumStores", NumStores) << " > "
564 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
565 });
566 return false;
567 }
568
569 // Remove any subregisters from inputs to phi nodes.
570 preprocessPhiNodes(*L.getHeader());
571 return true;
572}
573
574void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
575 MachineRegisterInfo &MRI = MF->getRegInfo();
576 SlotIndexes &Slots =
577 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
578
579 for (MachineInstr &PI : B.phis()) {
 // Operand 0 is the PHI's def; it must not carry a subregister.
580 MachineOperand &DefOp = PI.getOperand(0);
581 assert(DefOp.getSubReg() == 0);
582 auto *RC = MRI.getRegClass(DefOp.getReg());
583
 // PHI use operands come in (register, predecessor-block) pairs.
584 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
585 MachineOperand &RegOp = PI.getOperand(i);
586 if (RegOp.getSubReg() == 0)
587 continue;
588
589 // If the operand uses a subregister, replace it with a new register
590 // without subregisters, and generate a copy to the new register.
591 Register NewReg = MRI.createVirtualRegister(RC);
592 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
 // NOTE(review): the definition of the insertion point `At` is on a line
 // missing from this scrape (presumably PredB.getFirstTerminator()) —
 // confirm against upstream.
594 const DebugLoc &DL = PredB.findDebugLoc(At);
595 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
596 .addReg(RegOp.getReg(), getRegState(RegOp),
597 RegOp.getSubReg());
 // Keep SlotIndexes consistent with the newly inserted COPY.
598 Slots.insertMachineInstrInMaps(*Copy);
599 RegOp.setReg(NewReg);
600 RegOp.setSubReg(0);
601 }
602 }
603}
604
605/// The SMS algorithm consists of the following main steps:
606/// 1. Computation and analysis of the dependence graph.
607/// 2. Ordering of the nodes (instructions).
608/// 3. Attempt to Schedule the loop.
609bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
610 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
611
612 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
613 SwingSchedulerDAG SMS(
 // NOTE(review): the first argument line of this constructor call is
 // missing from this scrape — confirm against upstream.
615 II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
616
617 MachineBasicBlock *MBB = L.getHeader();
618 // The kernel should not include any terminator instructions. These
619 // will be added back later.
620 SMS.startBlock(MBB);
621
622 // Compute the number of 'real' instructions in the basic block by
623 // ignoring terminators.
624 unsigned size = MBB->size();
 // NOTE(review): the for-loop header line initializing the iterator is
 // missing here; the visible tail decrements `size` once per terminator.
626 E = MBB->instr_end();
627 I != E; ++I, --size)
628 ;
629
 // Schedule exactly the non-terminator region of the header block.
630 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
631 SMS.schedule();
632 SMS.exitRegion();
633
634 SMS.finishBlock();
635 return SMS.hasNewSchedule();
636}
637
648
649bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
650 MachineSchedContext Context;
651 Context.MF = MF;
652 Context.MLI = MLI;
653 Context.MDT = MDT;
654 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
655 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
656 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
657 Context.RegClassInfo->runOnMachineFunction(*MF);
658 WindowScheduler WS(&Context, L);
659 return WS.run();
660}
661
662bool MachinePipeliner::useSwingModuloScheduler() {
663 // SwingModuloScheduler does not work when WindowScheduler is forced.
 // NOTE(review): the return expression is on a line missing from this scrape
 // (presumably it checks the window-scheduling "force" mode) — confirm
 // against upstream.
665}
666
667bool MachinePipeliner::useWindowScheduler(bool Changed) {
668 // WindowScheduler does not work for following cases:
669 // 1. when it is off.
670 // 2. when SwingModuloScheduler is successfully scheduled.
671 // 3. when pragma II is enabled.
672 if (II_setByPragma) {
673 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
674 "llvm.loop.pipeline.initiationinterval is set.\n");
675 return false;
676 }
677
 // NOTE(review): the final return expression is on lines missing from this
 // scrape (presumably it combines the option mode with !Changed) — confirm
 // against upstream.
680}
681
682void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
683 if (SwpForceII > 0)
684 MII = SwpForceII;
685 else if (II_setByPragma > 0)
686 MII = II_setByPragma;
687 else
688 MII = std::max(ResMII, RecMII);
689}
690
691void SwingSchedulerDAG::setMAX_II() {
692 if (SwpForceII > 0)
693 MAX_II = SwpForceII;
694 else if (II_setByPragma > 0)
695 MAX_II = II_setByPragma;
696 else
697 MAX_II = MII + SwpIISearchRange;
698}
699
700/// We override the schedule function in ScheduleDAGInstrs to implement the
701/// scheduling part of the Swing Modulo Scheduling algorithm.
// NOTE(review): the function signature line (SwingSchedulerDAG::schedule) is
// missing from this scrape.
 // Step 1: build the dependence graph, including loop-carried edges.
703 buildSchedGraph(AA);
704 const LoopCarriedEdges LCE = addLoopCarriedDependences();
705 updatePhiDependences();
706 Topo.InitDAGTopologicalSorting();
707 changeDependences();
708 postProcessDAG();
709 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
710 LLVM_DEBUG({
711 dump();
712 dbgs() << "===== Loop Carried Edges Begin =====\n";
713 for (SUnit &SU : SUnits)
714 LCE.dump(&SU, TRI, &MRI);
715 dbgs() << "===== Loop Carried Edges End =====\n";
716 });
717
718 NodeSetType NodeSets;
719 findCircuits(NodeSets);
720 NodeSetType Circuits = NodeSets;
721
722 // Calculate the MII.
723 unsigned ResMII = calculateResMII();
724 unsigned RecMII = calculateRecMII(NodeSets);
725
726 fuseRecs(NodeSets);
727
728 // This flag is used for testing and can cause correctness problems.
729 if (SwpIgnoreRecMII)
730 RecMII = 0;
731
732 setMII(ResMII, RecMII);
733 setMAX_II();
734
735 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
736 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
737
738 // Can't schedule a loop without a valid MII.
739 if (MII == 0) {
740 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
741 NumFailZeroMII++;
742 Pass.ORE->emit([&]() {
 // NOTE(review): the remark-constructor lines inside the emit lambdas in
 // this function are partially missing from this scrape.
744 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
745 << "Invalid Minimal Initiation Interval: 0";
746 });
747 return;
748 }
749
750 // Don't pipeline large loops.
751 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
752 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
753 << ", we don't pipeline large loops\n");
754 NumFailLargeMaxMII++;
755 Pass.ORE->emit([&]() {
757 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
758 << "Minimal Initiation Interval too large: "
759 << ore::NV("MII", (int)MII) << " > "
760 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
761 << "Refer to -pipeliner-max-mii.";
762 });
763 return;
764 }
765
 // Step 2: order the nodes.
766 computeNodeFunctions(NodeSets);
767
768 registerPressureFilter(NodeSets);
769
770 colocateNodeSets(NodeSets);
771
772 checkNodeSets(NodeSets);
773
774 LLVM_DEBUG({
775 for (auto &I : NodeSets) {
776 dbgs() << " Rec NodeSet ";
777 I.dump();
778 }
779 });
780
781 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
782
783 groupRemainingNodes(NodeSets);
784
785 removeDuplicateNodes(NodeSets);
786
787 LLVM_DEBUG({
788 for (auto &I : NodeSets) {
789 dbgs() << " NodeSet ";
790 I.dump();
791 }
792 });
793
794 computeNodeOrder(NodeSets);
795
796 // check for node order issues
797 checkValidNodeOrder(Circuits);
798
 // Step 3: attempt to schedule the loop.
799 SMSchedule Schedule(Pass.MF, this);
800 Scheduled = schedulePipeline(Schedule);
801
802 if (!Scheduled){
803 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
804 NumFailNoSchedule++;
805 Pass.ORE->emit([&]() {
807 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
808 << "Unable to find schedule";
809 });
810 return;
811 }
812
813 unsigned numStages = Schedule.getMaxStageCount();
814 // No need to generate pipeline if there are no overlapped iterations.
815 if (numStages == 0) {
816 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
817 NumFailZeroStage++;
818 Pass.ORE->emit([&]() {
820 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
821 << "No need to pipeline - no overlapped iterations in schedule.";
822 });
823 return;
824 }
825 // Check that the maximum stage count is less than user-defined limit.
826 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
827 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
828 << " : too many stages, abort\n");
829 NumFailLargeMaxStage++;
830 Pass.ORE->emit([&]() {
832 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
833 << "Too many stages in schedule: "
834 << ore::NV("numStages", (int)numStages) << " > "
835 << ore::NV("SwpMaxStages", SwpMaxStages)
836 << ". Refer to -pipeliner-max-stages.";
837 });
838 return;
839 }
840
841 Pass.ORE->emit([&]() {
842 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
843 Loop.getHeader())
844 << "Pipelined succesfully!";
845 });
846
847 // Generate the schedule as a ModuloSchedule.
848 DenseMap<MachineInstr *, int> Cycles, Stages;
849 std::vector<MachineInstr *> OrderedInsts;
850 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
851 ++Cycle) {
852 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
853 OrderedInsts.push_back(SU->getInstr());
854 Cycles[SU->getInstr()] = Cycle;
855 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
856 }
857 }
 // NOTE(review): the declaration of NewInstrChanges is on a line missing
 // from this scrape.
859 for (auto &KV : NewMIs) {
860 Cycles[KV.first] = Cycles[KV.second];
861 Stages[KV.first] = Stages[KV.second];
862 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
863 }
864
865 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
866 std::move(Stages));
 // NOTE(review): the guard for the -pipeliner-annotate-for-testing path and
 // the ModuloScheduleTestAnnotater construction are on lines missing from
 // this scrape.
868 assert(NewInstrChanges.empty() &&
869 "Cannot serialize a schedule with InstrChanges!");
871 MSTI.annotate();
872 return;
873 }
874 // The experimental code generator can't work if there are InstChanges.
875 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
876 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
877 MSE.expand();
878 } else if (MVECodeGen && NewInstrChanges.empty() &&
879 LoopPipelinerInfo->isMVEExpanderSupported() &&
 // NOTE(review): one condition line of this else-if is missing here.
881 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
882 MSE.expand();
883 } else {
884 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
885 MSE.expand();
886 MSE.cleanup();
887 }
888 ++NumPipelined;
889}
890
891/// Clean up after the software pipeliner runs.
// NOTE(review): the function signature line is missing from this scrape.
 // Delete the scratch instructions created during scheduling.
893 for (auto &KV : NewMIs)
894 MF.deleteMachineInstr(KV.second);
895 NewMIs.clear();
896
897 // Call the superclass.
 // NOTE(review): the superclass call itself is on a line missing from this
 // scrape.
899}
900
901/// Return the register values for the operands of a Phi instruction.
902/// This function assume the instruction is a Phi.
// NOTE(review): the signature's first line is missing from this scrape.
904 Register &InitVal, Register &LoopVal) {
905 assert(Phi.isPHI() && "Expecting a Phi.");
906
907 InitVal = Register();
908 LoopVal = Register();
 // PHI operands come in (register, predecessor) pairs; classify each value
 // by whether its predecessor is the loop block.
909 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
910 if (Phi.getOperand(i + 1).getMBB() != Loop)
911 InitVal = Phi.getOperand(i).getReg();
912 else
913 LoopVal = Phi.getOperand(i).getReg();
914
915 assert(InitVal && LoopVal && "Unexpected Phi structure.");
916}
917
918/// Return the Phi register value that comes from the loop block.
// NOTE(review): the signature's first line is missing from this scrape.
920 const MachineBasicBlock *LoopBB) {
921 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
922 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
923 return Phi.getOperand(i).getReg();
 // No incoming value from LoopBB: return the default (invalid) register.
924 return Register();
925}
926
927/// Return true if SUb can be reached from SUa following the chain edges.
928static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
 // NOTE(review): the declarations of `Worklist` and `Visited` are on lines
 // missing from this scrape — confirm against upstream.
931 Worklist.push_back(SUa);
 // Iterative DFS that follows only order (chain) successor edges.
932 while (!Worklist.empty()) {
933 const SUnit *SU = Worklist.pop_back_val();
934 for (const auto &SI : SU->Succs) {
935 SUnit *SuccSU = SI.getSUnit();
936 if (SI.getKind() == SDep::Order) {
937 if (Visited.count(SuccSU))
938 continue;
939 if (SuccSU == SUb)
940 return true;
941 Worklist.push_back(SuccSU);
942 Visited.insert(SuccSU);
943 }
944 }
945 }
946 return false;
947}
948
// NOTE(review): the constructor's signature line is missing from this scrape;
// the visible body clears IsAllIdentified when any underlying object cannot
// be identified.
950 if (!getUnderlyingObjects())
951 return;
952 for (const Value *Obj : UnderlyingObjs)
953 if (!isIdentifiedObject(Obj)) {
954 IsAllIdentified = false;
955 break;
956 }
957}
958
// NOTE(review): the first line of this signature is missing from this scrape;
// this is SUnitWithMemInfo::isTriviallyDisjoint.
960 const SUnitWithMemInfo &Other) const {
961 // If all underlying objects are identified objects and there is no overlap
962 // between them, then these two instructions are disjoint.
963 if (!IsAllIdentified || !Other.IsAllIdentified)
964 return false;
965 for (const Value *Obj : UnderlyingObjs)
966 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
967 return false;
968 return true;
969}
970
971/// Collect the underlying objects for the memory references of an instruction.
972/// This function calls the code in ValueTracking, but first checks that the
973/// instruction has a memory operand.
974/// Returns false if we cannot find the underlying objects.
975bool SUnitWithMemInfo::getUnderlyingObjects() {
976 const MachineInstr *MI = SU->getInstr();
 // Only instructions with exactly one IR-value-backed memory operand can be
 // analyzed here.
977 if (!MI->hasOneMemOperand())
978 return false;
979 MachineMemOperand *MM = *MI->memoperands_begin();
980 if (!MM->getValue())
981 return false;
982 MemOpValue = MM->getValue();
983 MemOpOffset = MM->getOffset();
 // NOTE(review): the call populating UnderlyingObjs (the ValueTracking
 // helper) is on a line missing from this scrape — confirm upstream.
985
986 // TODO: A no alias scope may be valid only in a single iteration. In this
987 // case we need to peel off it like LoopAccessAnalysis does.
988 AATags = MM->getAAInfo();
989 return true;
990}
991
992/// Returns true if there is a loop-carried order dependency from \p Src to \p
993/// Dst.
994static bool
995hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
 // NOTE(review): one parameter line (the alias-analysis / TargetInstrInfo
 // parameters referenced below as BAA and TII) is missing from this scrape.
997 const TargetRegisterInfo *TRI,
998 const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
 // Provably independent accesses carry no dependency.
999 if (Src.isTriviallyDisjoint(Dst))
1000 return false;
 // An intra-iteration chain path from Src to Dst already orders the pair.
1001 if (isSuccOrder(Src.SU, Dst.SU))
1002 return false;
1003
1004 MachineInstr &SrcMI = *Src.SU->getInstr();
1005 MachineInstr &DstMI = *Dst.SU->getInstr();
1006 if (PerformCheapCheck) {
1007 // First, perform the cheaper check that compares the base register.
1008 // If they are the same and the load offset is less than the store
1009 // offset, then mark the dependence as loop carried potentially.
1010 //
1011 // TODO: This check will be removed.
1012 const MachineOperand *BaseOp1, *BaseOp2;
1013 int64_t Offset1, Offset2;
1014 bool Offset1IsScalable, Offset2IsScalable;
1015 if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
1016 TRI) &&
1017 TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
1018 TRI)) {
1019 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
1020 Offset1IsScalable == Offset2IsScalable &&
1021 (int)Offset1 < (int)Offset2) {
1022 assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
1023 "What happened to the chain edge?");
1024 return true;
1025 }
1026 }
1027 }
1028
1029 if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
1030 return false;
1031
1032 // Second, the more expensive check that uses alias analysis on the
1033 // base registers. If they alias, and the load offset is less than
1034 // the store offset, then mark the dependence as loop carried.
1035 if (Src.isUnknown() || Dst.isUnknown())
1036 return true;
1037 if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
1038 return true;
1039
1040 if (BAA.isNoAlias(
1041 MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
1042 MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
1043 return false;
1044
1045 // AliasAnalysis sometimes gives up on following the underlying
1046 // object. In such a case, separate checks for underlying objects may
1047 // prove that there are no aliases between two accesses.
1048 for (const Value *SrcObj : Src.UnderlyingObjs)
1049 for (const Value *DstObj : Dst.UnderlyingObjs)
1050 if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
1051 MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
1052 return true;
1053
1054 return false;
1055}
1056
1057void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1058 const MachineInstr *MI = SU->getInstr();
1059 if (!MI->mayLoadOrStore())
1060 return;
1061 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1062}
1063
1065 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1066 const TargetRegisterInfo *TRI)
1067 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1068 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1069
  // Traverse all instructions and collect only those that can participate in
  // a loop-carried order dependence (see getInstrTag for the classification).
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  // With the interesting instructions collected, build the dependence bits.
  computeDependenciesAux();
}
1083
1084std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1085LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1086 MachineInstr *MI = SU->getInstr();
1087 if (TII->isGlobalMemoryObject(MI))
1088 return InstrTag::Barrier;
1089
1090 if (MI->mayStore() ||
1091 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1092 return InstrTag::LoadOrStore;
1093
1094 if (MI->mayRaiseFPException())
1095 return InstrTag::FPExceptions;
1096
1097 return std::nullopt;
1098}
1099
1100void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1101 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1102 bool PerformCheapCheck) {
1103 // Avoid self-dependencies.
1104 if (Src.SU == Dst.SU)
1105 return;
1106
1107 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
1108 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1109}
1110
1111void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1112 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1113 // Add load-to-store dependencies (WAR).
1114 for (const SUnitWithMemInfo &Src : From.Loads)
1115 for (const SUnitWithMemInfo &Dst : To.Stores)
1116 // Perform a cheap check first if this is a forward dependency.
1117 addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);
1118
1119 // Add store-to-load dependencies (RAW).
1120 for (const SUnitWithMemInfo &Src : From.Stores)
1121 for (const SUnitWithMemInfo &Dst : To.Loads)
1122 addDependenciesBetweenSUs(Src, Dst);
1123
1124 // Add store-to-store dependencies (WAW).
1125 for (const SUnitWithMemInfo &Src : From.Stores)
1126 for (const SUnitWithMemInfo &Dst : To.Stores)
1127 addDependenciesBetweenSUs(Src, Dst);
1128}
1129
void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  // Partition the tagged instructions into chunks delimited by barriers:
  // a barrier closes the current chunk, and loads/stores are appended to
  // the most recent one.
  // NOTE(review): assumes Chunks already holds one (empty) active chunk
  // before this loop -- confirm; otherwise Chunks.back() below would be
  // invalid for a load/store that precedes any barrier.
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      // Start a fresh chunk; later memory operations go into it.
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, dependencies from the
  // last barrier instruction (or load/store below it) to the first barrier
  // instruction (or load/store above it).
}
1158
1159/// Add a chain edge between a load and store if the store can be an
1160/// alias of the load on a subsequent iteration, i.e., a loop carried
1161/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1162/// but that code doesn't create loop carried dependences.
1163/// TODO: Also compute output-dependencies.
1164LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1165 LoopCarriedEdges LCE;
1166
1167 // Add loop-carried order-dependencies
1168 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1169 LCODTracker.computeDependencies();
1170 for (unsigned I = 0; I != SUnits.size(); I++)
1171 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1172 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1173
1174 LCE.modifySUnits(SUnits, TII);
1175 return LCE;
1176}
1177
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
void SwingSchedulerDAG::updatePhiDependences() {
  // Deferred removals; erasing while iterating I.Preds would invalidate it.
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Registers that connect this instruction to a Phi: a use whose value a
    // Phi defines, and a def that a Phi uses (loop-carried value).
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
             UI = MRI.use_instr_begin(Reg),
             UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              // Loop-carried value: the Phi reads it on the next iteration.
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            // Let the target adjust the latency of the Phi-to-use edge.
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi: keep only order edges
    // from Phis that are actually connected through HasPhiUse/HasPhiDef.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}
1264
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Bail out if rewiring the edges would create a dependence cycle.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
1328
/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
/// function is an approximation of SMSchedule::finalizeSchedule with all
/// non-const operations removed.
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    // Emit the per-cycle instructions in their final order and record the
    // stage each one was scheduled in.
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
1363
1364namespace {
1365
1366// FuncUnitSorter - Comparison operator used to sort instructions by
1367// the number of functional unit choices.
1368struct FuncUnitSorter {
1369 const InstrItineraryData *InstrItins;
1370 const MCSubtargetInfo *STI;
1371 DenseMap<InstrStage::FuncUnits, unsigned> Resources;
1372
1373 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1374 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1375
1376 // Compute the number of functional unit alternatives needed
1377 // at each stage, and take the minimum value. We prioritize the
1378 // instructions by the least number of choices first.
1379 unsigned minFuncUnits(const MachineInstr *Inst,
1380 InstrStage::FuncUnits &F) const {
1381 unsigned SchedClass = Inst->getDesc().getSchedClass();
1382 unsigned min = UINT_MAX;
1383 if (InstrItins && !InstrItins->isEmpty()) {
1384 for (const InstrStage &IS :
1385 make_range(InstrItins->beginStage(SchedClass),
1386 InstrItins->endStage(SchedClass))) {
1387 InstrStage::FuncUnits funcUnits = IS.getUnits();
1388 unsigned numAlternatives = llvm::popcount(funcUnits);
1389 if (numAlternatives < min) {
1390 min = numAlternatives;
1391 F = funcUnits;
1392 }
1393 }
1394 return min;
1395 }
1396 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1397 const MCSchedClassDesc *SCDesc =
1398 STI->getSchedModel().getSchedClassDesc(SchedClass);
1399 if (!SCDesc->isValid())
1400 // No valid Schedule Class Desc for schedClass, should be
1401 // Pseudo/PostRAPseudo
1402 return min;
1403
1404 for (const MCWriteProcResEntry &PRE :
1405 make_range(STI->getWriteProcResBegin(SCDesc),
1406 STI->getWriteProcResEnd(SCDesc))) {
1407 if (!PRE.ReleaseAtCycle)
1408 continue;
1409 const MCProcResourceDesc *ProcResource =
1410 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1411 unsigned NumUnits = ProcResource->NumUnits;
1412 if (NumUnits < min) {
1413 min = NumUnits;
1414 F = PRE.ProcResourceIdx;
1415 }
1416 }
1417 return min;
1418 }
1419 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1420 }
1421
1422 // Compute the critical resources needed by the instruction. This
1423 // function records the functional units needed by instructions that
1424 // must use only one functional unit. We use this as a tie breaker
1425 // for computing the resource MII. The instrutions that require
1426 // the same, highly used, functional unit have high priority.
1427 void calcCriticalResources(MachineInstr &MI) {
1428 unsigned SchedClass = MI.getDesc().getSchedClass();
1429 if (InstrItins && !InstrItins->isEmpty()) {
1430 for (const InstrStage &IS :
1431 make_range(InstrItins->beginStage(SchedClass),
1432 InstrItins->endStage(SchedClass))) {
1433 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1434 if (llvm::popcount(FuncUnits) == 1)
1435 Resources[FuncUnits]++;
1436 }
1437 return;
1438 }
1439 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1440 const MCSchedClassDesc *SCDesc =
1441 STI->getSchedModel().getSchedClassDesc(SchedClass);
1442 if (!SCDesc->isValid())
1443 // No valid Schedule Class Desc for schedClass, should be
1444 // Pseudo/PostRAPseudo
1445 return;
1446
1447 for (const MCWriteProcResEntry &PRE :
1448 make_range(STI->getWriteProcResBegin(SCDesc),
1449 STI->getWriteProcResEnd(SCDesc))) {
1450 if (!PRE.ReleaseAtCycle)
1451 continue;
1452 Resources[PRE.ProcResourceIdx]++;
1453 }
1454 return;
1455 }
1456 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1457 }
1458
1459 /// Return true if IS1 has less priority than IS2.
1460 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1461 InstrStage::FuncUnits F1 = 0, F2 = 0;
1462 unsigned MFUs1 = minFuncUnits(IS1, F1);
1463 unsigned MFUs2 = minFuncUnits(IS2, F2);
1464 if (MFUs1 == MFUs2)
1465 return Resources.lookup(F1) < Resources.lookup(F2);
1466 return MFUs1 > MFUs2;
1467 }
1468};
1469
/// Calculate the maximum register pressure of the scheduled instructions
/// stream so that a schedule whose pressure exceeds the target's limits
/// (plus a safety margin) can be rejected.
class HighRegisterPressureDetector {
  // The loop body being analyzed; all queries are restricted to this block.
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets defined by the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it is
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

  // Cached def/use operand information for each instruction of OrigMBB.
  DenseMap<MachineInstr *, RegisterOperands> ROMap;

  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Debug helper: print a pressure vector as "[p0 p1 ...]".
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Debug helper: print the pressure sets that Reg belongs to.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set Reg belongs to.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set Reg belongs to.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is reserved one, for example, stack pointer
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined inside OrigMBB.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    DenseSet<Register> Used;
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  //   - by an instruction of the current iteration
  //   - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  //   - next iteration's phi instructions in i-th stage
  //   - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    //   - live-in one
    //   - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // A higher score means the use happens conceptually later: a phi's use
    // of a loop-carried value belongs to the next iteration (see above).
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    DenseMap<Register, MachineInstr *> LastUseMI;
    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4..
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Make a definition live in its iteration's set and raise the pressure.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill a register at its last use and lower the pressure accordingly.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      // Reject if the pressure plus the safety margin would exceed the limit.
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1799
1800} // end anonymous namespace
1801
1802/// Calculate the resource constrained minimum initiation interval for the
1803/// specified loop. We use the DFA to model the resources needed for
1804/// each instruction, and we ignore dependences. A different DFA is created
1805/// for each cycle that is required. When adding a new instruction, we attempt
1806/// to add it to each existing DFA, until a legal space is found. If the
1807/// instruction cannot be reserved in an existing DFA, we create a new one.
1808unsigned SwingSchedulerDAG::calculateResMII() {
1809 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1810 ResourceManager RM(&MF.getSubtarget(), this);
1811 return RM.calculateResMII();
1812}
1813
1814/// Calculate the recurrence-constrainted minimum initiation interval.
1815/// Iterate over each circuit. Compute the delay(c) and distance(c)
1816/// for each circuit. The II needs to satisfy the inequality
1817/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1818/// II that satisfies the inequality, and the RecMII is the maximum
1819/// of those values.
1820unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1821 unsigned RecMII = 0;
1822
1823 for (NodeSet &Nodes : NodeSets) {
1824 if (Nodes.empty())
1825 continue;
1826
1827 unsigned Delay = Nodes.getLatency();
1828 unsigned Distance = 1;
1829
1830 // ii = ceil(delay / distance)
1831 unsigned CurMII = (Delay + Distance - 1) / Distance;
1832 Nodes.setRecMII(CurMII);
1833 if (CurMII > RecMII)
1834 RecMII = CurMII;
1835 }
1836
1837 return RecMII;
1838}
1839
1840/// Create the adjacency structure of the nodes in the graph.
1841void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1842 SwingSchedulerDAG *DAG) {
1843 BitVector Added(SUnits.size());
1844 DenseMap<int, int> OutputDeps;
1845 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1846 Added.reset();
1847 // Add any successor to the adjacency matrix and exclude duplicates.
1848 for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
1849 // Only create a back-edge on the first and last nodes of a dependence
1850 // chain. This records any chains and adds them later.
1851 if (OE.isOutputDep()) {
1852 int N = OE.getDst()->NodeNum;
1853 int BackEdge = i;
1854 auto Dep = OutputDeps.find(BackEdge);
1855 if (Dep != OutputDeps.end()) {
1856 BackEdge = Dep->second;
1857 OutputDeps.erase(Dep);
1858 }
1859 OutputDeps[N] = BackEdge;
1860 }
1861 // Do not process a boundary node, an artificial node.
1862 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1863 continue;
1864
1865 // This code is retained o preserve previous behavior and prevent
1866 // regression. This condition means that anti-dependnecies within an
1867 // iteration are ignored when searching circuits. Therefore it's natural
1868 // to consider this dependence as well.
1869 // FIXME: Remove this code if it doesn't have significant impact on
1870 // performance.
1871 if (OE.isAntiDep())
1872 continue;
1873
1874 int N = OE.getDst()->NodeNum;
1875 if (!Added.test(N)) {
1876 AdjK[i].push_back(N);
1877 Added.set(N);
1878 }
1879 }
1880 // A chain edge between a store and a load is treated as a back-edge in the
1881 // adjacency matrix.
1882 for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
1883 SUnit *Src = IE.getSrc();
1884 SUnit *Dst = IE.getDst();
1885 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
1886 continue;
1887 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1888 int N = Src->NodeNum;
1889 if (!Added.test(N)) {
1890 AdjK[i].push_back(N);
1891 Added.set(N);
1892 }
1893 }
1894 }
1895 }
1896 // Add back-edges in the adjacency matrix for the output dependences.
1897 for (auto &OD : OutputDeps)
1898 if (!Added.test(OD.second)) {
1899 AdjK[OD.first].push_back(OD.second);
1900 Added.set(OD.second);
1901 }
1902}
1903
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. This is the recursive core of Johnson's circuit
/// enumeration: V is the current node, S the start node of the search, and
/// HasBackedge records whether the current path already contains a back-edge.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Stop once the global path budget is exhausted.
    if (NumPaths > MaxPaths)
      break;
    // Only consider nodes numbered >= S so each circuit is enumerated once.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start; record it unless the path
      // already went through a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V was found: leave V blocked, and arrange for it
    // to be unblocked when any successor W gets unblocked.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1945
1946/// Unblock a node in the circuit finding algorithm.
1947void SwingSchedulerDAG::Circuits::unblock(int U) {
1948 Blocked.reset(U);
1949 SmallPtrSet<SUnit *, 4> &BU = B[U];
1950 while (!BU.empty()) {
1951 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1952 assert(SI != BU.end() && "Invalid B set.");
1953 SUnit *W = *SI;
1954 BU.erase(W);
1955 if (Blocked.test(W->NodeNum))
1956 unblock(W->NodeNum);
1957 }
1958}
1959
1960/// Identify all the elementary circuits in the dependence graph using
1961/// Johnson's circuit algorithm.
1962void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1963 Circuits Cir(SUnits, Topo);
1964 // Create the adjacency structure.
1965 Cir.createAdjacencyStructure(this);
1966 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1967 Cir.reset();
1968 Cir.circuit(I, I, NodeSets, this);
1969 }
1970}
1971
1972// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1973// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1974// additional copies that are needed across iterations. An artificial dependence
1975// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1976
1977// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
1979// PHI-------True-Dep------> USEOfPhi
1980
1981// The mutation creates
1982// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1983
// Overall, this ensures that USEOfPHI is scheduled before SRCOfCopy (since
// USE is a predecessor), which in turn implies that the COPY/REG_SEQUENCE is
// scheduled late enough to avoid additional copies across iterations. The
// resulting scheduling order would be
// USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
1989
/// Apply the mutation described above: for each COPY/REG_SEQUENCE whose
/// result is loop-carried through a PHI, add an artificial edge from every
/// user of that PHI to the copy's source so the copy is scheduled late.
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    // Nothing to do unless both a loop-carried PHI and a source were found.
    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container so its users are searched transitively as well.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          // Chained PHI/REG_SEQUENCE: follow it instead of treating it as a
          // direct use.
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        // Only add USE -> SRC when SRC cannot already reach USE; otherwise the
        // new edge would create a cycle in the DAG.
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
2051
2052/// Compute several functions need to order the nodes for scheduling.
2053/// ASAP - Earliest time to schedule a node.
2054/// ALAP - Latest time to schedule a node.
2055/// MOV - Mobility function, difference between ALAP and ASAP.
2056/// D - Depth of each node.
2057/// H - Height of each node.
2058void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2059 ScheduleInfo.resize(SUnits.size());
2060
2061 LLVM_DEBUG({
2062 for (int I : Topo) {
2063 const SUnit &SU = SUnits[I];
2064 dumpNode(SU);
2065 }
2066 });
2067
2068 int maxASAP = 0;
2069 // Compute ASAP and ZeroLatencyDepth.
2070 for (int I : Topo) {
2071 int asap = 0;
2072 int zeroLatencyDepth = 0;
2073 SUnit *SU = &SUnits[I];
2074 for (const auto &IE : DDG->getInEdges(SU)) {
2075 SUnit *Pred = IE.getSrc();
2076 if (IE.getLatency() == 0)
2077 zeroLatencyDepth =
2078 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2079 if (IE.ignoreDependence(true))
2080 continue;
2081 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2082 IE.getDistance() * MII));
2083 }
2084 maxASAP = std::max(maxASAP, asap);
2085 ScheduleInfo[I].ASAP = asap;
2086 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2087 }
2088
2089 // Compute ALAP, ZeroLatencyHeight, and MOV.
2090 for (int I : llvm::reverse(Topo)) {
2091 int alap = maxASAP;
2092 int zeroLatencyHeight = 0;
2093 SUnit *SU = &SUnits[I];
2094 for (const auto &OE : DDG->getOutEdges(SU)) {
2095 SUnit *Succ = OE.getDst();
2096 if (Succ->isBoundaryNode())
2097 continue;
2098 if (OE.getLatency() == 0)
2099 zeroLatencyHeight =
2100 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2101 if (OE.ignoreDependence(true))
2102 continue;
2103 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2104 OE.getDistance() * MII));
2105 }
2106
2107 ScheduleInfo[I].ALAP = alap;
2108 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2109 }
2110
2111 // After computing the node functions, compute the summary for each node set.
2112 for (NodeSet &I : NodeSets)
2113 I.computeNodeSetInfo(this);
2114
2115 LLVM_DEBUG({
2116 for (unsigned i = 0; i < SUnits.size(); i++) {
2117 dbgs() << "\tNode " << i << ":\n";
2118 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2119 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2120 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2121 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2122 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2123 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2124 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2125 }
2126 });
2127}
2128
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. If S is given, candidates are restricted to members of S.
/// Returns true when at least one such predecessor exists.
                   const NodeSet *S = nullptr) {
  Preds.clear();

  for (SUnit *SU : NodeOrder) {
    // Regular predecessors that lie outside NodeOrder.
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      if (S && S->count(PredSU) == 0)
        continue;
      if (IE.ignoreDependence(true))
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Preds.insert(PredSU);
    }

    // An anti-dependence successor acts as a predecessor for ordering
    // purposes, so treat those successors the same way.
    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      if (!OE.isAntiDep())
        continue;
      if (S && S->count(SuccSU) == 0)
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Preds.insert(SuccSU);
    }
  }
  return !Preds.empty();
}
2164
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. If S is given, candidates are restricted to members of S.
/// Returns true when at least one such successor exists.
                   const NodeSet *S = nullptr) {
  Succs.clear();

  for (SUnit *SU : NodeOrder) {
    // Regular successors that lie outside NodeOrder.
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      if (S && S->count(SuccSU) == 0)
        continue;
      if (OE.ignoreDependence(false))
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Succs.insert(SuccSU);
    }

    // An anti-dependence predecessor acts as a successor for ordering
    // purposes, so treat those predecessors the same way.
    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      if (!IE.isAntiDep())
        continue;
      if (S && S->count(PredSU) == 0)
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Succs.insert(PredSU);
    }
  }
  return !Succs.empty();
}
2200
2201/// Return true if there is a path from the specified node to any of the nodes
2202/// in DestNodes. Keep track and return the nodes in any path.
2203static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2204 SetVector<SUnit *> &DestNodes,
2205 SetVector<SUnit *> &Exclude,
2206 SmallPtrSet<SUnit *, 8> &Visited,
2207 SwingSchedulerDDG *DDG) {
2208 if (Cur->isBoundaryNode())
2209 return false;
2210 if (Exclude.contains(Cur))
2211 return false;
2212 if (DestNodes.contains(Cur))
2213 return true;
2214 if (!Visited.insert(Cur).second)
2215 return Path.contains(Cur);
2216 bool FoundPath = false;
2217 for (const auto &OE : DDG->getOutEdges(Cur))
2218 if (!OE.ignoreDependence(false))
2219 FoundPath |=
2220 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2221 for (const auto &IE : DDG->getInEdges(Cur))
2222 if (IE.isAntiDep() && IE.getDistance() == 0)
2223 FoundPath |=
2224 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2225 if (FoundPath)
2226 Path.insert(Cur);
2227 return FoundPath;
2228}
2229
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
                            NodeSet &NS) {
  // First collect every register used inside the node-set by a non-PHI
  // instruction: virtual registers directly, allocatable physical registers
  // via their register units.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        Uses.insert_range(TRI->regunits(Reg.asMCReg()));
    }
  }
  // A non-dead def with no use inside the node-set is considered live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
        }
      }
  // Seed the pressure tracker with the computed live-out set.
  RPTracker.addLiveRegs(LiveOutRegs);
}
2266
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high. Marks the offending node in the set
/// via NodeSet::setExceedPressure.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    // Track pressure over just the instructions of this node-set.
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Process the node-set's instructions bottom-up (descending NodeNum).
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        // One instruction exceeding the limit is enough to flag the set.
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2310
2311/// A heuristic to colocate node sets that have the same set of
2312/// successors.
2313void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2314 unsigned Colocate = 0;
2315 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2316 NodeSet &N1 = NodeSets[i];
2317 SmallSetVector<SUnit *, 8> S1;
2318 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2319 continue;
2320 for (int j = i + 1; j < e; ++j) {
2321 NodeSet &N2 = NodeSets[j];
2322 if (N1.compareRecMII(N2) != 0)
2323 continue;
2324 SmallSetVector<SUnit *, 8> S2;
2325 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2326 continue;
2327 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2328 N1.setColocate(++Colocate);
2329 N2.setColocate(Colocate);
2330 break;
2331 }
2332 }
2333 }
2334}
2335
2336/// Check if the existing node-sets are profitable. If not, then ignore the
2337/// recurrent node-sets, and attempt to schedule all nodes together. This is
2338/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2339/// then it's best to try to schedule all instructions together instead of
2340/// starting with the recurrent node-sets.
2341void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2342 // Look for loops with a large MII.
2343 if (MII < 17)
2344 return;
2345 // Check if the node-set contains only a simple add recurrence.
2346 for (auto &NS : NodeSets) {
2347 if (NS.getRecMII() > 2)
2348 return;
2349 if (NS.getMaxDepth() > MII)
2350 return;
2351 }
2352 NodeSets.clear();
2353 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2354}
2355
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  SmallPtrSet<SUnit *, 8> Visited;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    SmallSetVector<SUnit *, 8> N;
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        // Visited is reset per search, while Path accumulates the union of
        // all discovered path nodes across the successors.
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Everything placed so far counts as "previous" for later node-sets.
    NodesAdded.insert_range(I);
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  SmallSetVector<SUnit *, 8> N;
  if (succ_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a
  // node in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new node sets with the connected nodes of any remaining node that
  // has not been placed yet.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2419
2420/// Add the node to the set, and add all of its connected nodes to the set.
2421void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2422 SetVector<SUnit *> &NodesAdded) {
2423 NewSet.insert(SU);
2424 NodesAdded.insert(SU);
2425 for (auto &OE : DDG->getOutEdges(SU)) {
2426 SUnit *Successor = OE.getDst();
2427 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2428 NodesAdded.count(Successor) == 0)
2429 addConnectedNodes(Successor, NewSet, NodesAdded);
2430 }
2431 for (auto &IE : DDG->getInEdges(SU)) {
2432 SUnit *Predecessor = IE.getSrc();
2433 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2434 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2435 }
2436}
2437
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in a different container (Result, which is cleared first).
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  Result.clear();
  for (SUnit *SU : Set1) {
    // Collect every member of Set1 that also appears in Set2.
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
2449
2450/// Merge the recurrence node sets that have the same initial node.
2451void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2452 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2453 ++I) {
2454 NodeSet &NI = *I;
2455 for (NodeSetType::iterator J = I + 1; J != E;) {
2456 NodeSet &NJ = *J;
2457 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2458 if (NJ.compareRecMII(NI) > 0)
2459 NI.setRecMII(NJ.getRecMII());
2460 for (SUnit *SU : *J)
2461 I->insert(SU);
2462 NodeSets.erase(J);
2463 E = NodeSets.end();
2464 } else {
2465 ++J;
2466 }
2467 }
2468 }
2469}
2470
2471/// Remove nodes that have been scheduled in previous NodeSets.
2472void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2473 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2474 ++I)
2475 for (NodeSetType::iterator J = I + 1; J != E;) {
2476 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2477
2478 if (J->empty()) {
2479 NodeSets.erase(J);
2480 E = NodeSets.end();
2481 } else {
2482 ++J;
2483 }
2484 }
2485}
2486
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  // R is the worklist of candidates for the current scheduling direction.
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Seed the worklist and pick the initial direction based on how the
    // already-ordered nodes connect to this node-set.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << "  Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // A single node-set: start bottom-up from the nodes with no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP (ties broken by NodeNum).
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Add the scheduled node's in-set, unordered successors.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // Top-down pass exhausted: flip direction and reseed from the
        // predecessors of what has been ordered so far.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // A node flagged for excess register pressure restarts the pass
          // top-down from the node-set's first node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Add the scheduled node's in-set, unordered predecessors.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // Bottom-up pass exhausted: flip direction and reseed from the
        // successors of what has been ordered so far.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2652
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  // Optional detector used to reject schedules that are likely to introduce
  // register spills/fills.
  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
  if (LimitRegPressure) {
    HRPDetector =
        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
    HRPDetector->init(RegClassInfo);
  }
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG(
          dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));

      // An empty [EarlyStart, LateStart] window means the node cannot be
      // placed at this II; otherwise search the window in the direction
      // implied by which bound is constrained.
      if (EarlyStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
        scheduleFound =
            Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
      else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
        scheduleFound =
            Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
      else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
        // When scheduling a Phi it is better to start at the late cycle and
        // go backwards. The default order may insert the Phi too far away
        // from its first dependence.
        // Also, do backward search when all scheduled predecessors are
        // loop-carried output/order dependencies. Empirically, there are also
        // cases where scheduling becomes possible with backward search.
        if (SU->getInstr()->isPHI() ||
            Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
          scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
      } else {
        // No constraints from already-scheduled neighbors: place the node
        // relative to its ASAP value.
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }

      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, validate it against the validation-only
    // dependencies.
    if (scheduleFound)
      scheduleFound = DDG->isValidSchedule(Schedule);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);

    // If a schedule was found and the option is enabled, check if the schedule
    // might generate additional register spills/fills.
    if (scheduleFound && LimitRegPressure)
      scheduleFound =
          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  // Give the target a chance to reject an otherwise valid schedule.
  if (scheduleFound) {
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // Only a schedule that actually overlaps stages (MaxStageCount > 0) is
  // worth using; otherwise report failure.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2780
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  // Holds the single operand register defined inside MI's block, if unique.
  Register Result;
  for (const MachineOperand &Use : MI.all_uses()) {
    Register Reg = Use.getReg();
    // A physical-register use makes the analysis unreliable; give up.
    if (!Reg.isVirtual())
      return Register();
    // Ignore operands whose definition lies outside MI's block.
    if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
      continue;
    // A second in-block operand means the candidate is not unique; fail.
    if (Result)
      return Register();
    Result = Reg;
  }
  return Result;
}
2796
/// When Op is a value that is incremented recursively in a loop and there is a
/// unique instruction that increments it, returns true and sets Value to the
/// per-iteration increment amount.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return false;

  Register OrgReg = Op.getReg();
  Register CurReg = OrgReg;
  const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();

  const TargetInstrInfo *TII =
      LoopBB->getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      LoopBB->getParent()->getSubtarget().getRegisterInfo();

  // The recurrence must contain exactly one PHI and one increment.
  MachineInstr *Phi = nullptr;
  MachineInstr *Increment = nullptr;

  // Traverse definitions until it reaches Op or an instruction that does not
  // satisfy the condition.
  // Acceptable example:
  //   bb.0:
  //     %0 = PHI %3, %bb.0, ...
  //     %2 = ADD %0, Value
  //     ... = LOAD %2(Op)
  //     %3 = COPY %2
  while (true) {
    if (!CurReg.isValid() || !CurReg.isVirtual())
      return false;
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    // Every definition in the chain must live in the loop block.
    if (Def->getParent() != LoopBB)
      return false;

    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters.
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // There must be just one Phi.
      if (Phi)
        return false;
      Phi = Def;
      CurReg = getLoopPhiReg(*Def, LoopBB);
    } else if (TII->getIncrementValue(*Def, Value)) {
      // Potentially a unique increment.
      if (Increment)
        // Multiple increments exist; the recurrence is not simple.
        return false;

      const MachineOperand *BaseOp;
      int64_t Offset;
      bool OffsetIsScalable;
      if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
                                       TRI)) {
        // Pre/post increment instruction: continue from its base register.
        CurReg = BaseOp->getReg();
      } else {
        // If only one of the operands is defined within the loop, it is
        // assumed to be the incremented value.
        CurReg = findUniqueOperandDefinedInLoop(*Def);
        if (!CurReg.isValid())
          return false;
      }
      Increment = Def;
    } else {
      return false;
    }
    // The chain closed back to the original register: a recurrence.
    if (CurReg == OrgReg)
      break;
  }

  // A valid recurrence needs both the Phi and the increment instruction.
  if (!Phi || !Increment)
    return false;

  return true;
}
2875
2876/// Return true if we can compute the amount the instruction changes
2877/// during each iteration. Set Delta to the amount of the change.
2878bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2879 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2880 const MachineOperand *BaseOp;
2881 int64_t Offset;
2882 bool OffsetIsScalable;
2883 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2884 return false;
2885
2886 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2887 if (OffsetIsScalable)
2888 return false;
2889
2890 if (!BaseOp->isReg())
2891 return false;
2892
2893 return findLoopIncrementValue(*BaseOp, Delta);
2894}
2895
2896/// Check if we can change the instruction to use an offset value from the
2897/// previous iteration. If so, return true and set the base and offset values
2898/// so that we can rewrite the load, if necessary.
2899/// v1 = Phi(v0, v3)
2900/// v2 = load v1, 0
2901/// v3 = post_store v1, 4, x
2902/// This function enables the load to be rewritten as v2 = load v3, 4.
2903bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2904 unsigned &BasePos,
2905 unsigned &OffsetPos,
2906 Register &NewBase,
2907 int64_t &Offset) {
2908 // Get the load instruction.
2909 if (TII->isPostIncrement(*MI))
2910 return false;
2911 unsigned BasePosLd, OffsetPosLd;
2912 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2913 return false;
2914 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2915
2916 // Look for the Phi instruction.
2917 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2918 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2919 if (!Phi || !Phi->isPHI())
2920 return false;
2921 // Get the register defined in the loop block.
2922 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2923 if (!PrevReg)
2924 return false;
2925
2926 // Check for the post-increment load/store instruction.
2927 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2928 if (!PrevDef || PrevDef == MI)
2929 return false;
2930
2931 if (!TII->isPostIncrement(*PrevDef))
2932 return false;
2933
2934 unsigned BasePos1 = 0, OffsetPos1 = 0;
2935 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2936 return false;
2937
2938 // Make sure that the instructions do not access the same memory location in
2939 // the next iteration.
2940 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2941 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2942 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2943 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2944 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2945 MF.deleteMachineInstr(NewMI);
2946 if (!Disjoint)
2947 return false;
2948
2949 // Set the return value once we determine that we return true.
2950 BasePos = BasePosLd;
2951 OffsetPos = OffsetPosLd;
2952 NewBase = PrevReg;
2953 Offset = StoreOffset;
2954 return true;
2955}
2956
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
                                         SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // Recorded change: the replacement base register and the per-stage
    // offset adjustment for this instruction.
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare where the base definition and this use ended up in the final
    // schedule: only a use scheduled in an earlier stage than its base
    // definition needs rewriting.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // Switch to the replacement base register; one stage of offset
        // adjustment is then already covered by the new base.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Replace the SUnit's instruction and remember the mapping so the old
      // instruction can be restored/cleaned up later.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
2992
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against Phi cycles that never reach a non-Phi definition.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs starting at index 1; pick
    // the incoming value that flows in from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3010
/// Return false if there is no overlap between the region accessed by BaseMI in
/// an iteration and the region accessed by OtherMI in subsequent iterations.
/// Returning true is always conservatively safe: it only means "may overlap".
    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
  // Both accesses must advance by the same known per-iteration increment
  // (Delta); otherwise assume they may overlap.
  int DeltaB, DeltaO, Delta;
  if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
      DeltaB != DeltaO)
    return true;
  Delta = DeltaB;

  const MachineOperand *BaseOpB, *BaseOpO;
  int64_t OffsetB, OffsetO;
  bool OffsetBIsScalable, OffsetOIsScalable;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
                                    OffsetBIsScalable, TRI) ||
      !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                    OffsetOIsScalable, TRI))
    return true;

  // The interval arithmetic below assumes fixed-size offsets.
  if (OffsetBIsScalable || OffsetOIsScalable)
    return true;

  if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
    // Pass cases with different base operands but same initial values.
    // Typically for when pre/post increment is used.

    if (!BaseOpB->isReg() || !BaseOpO->isReg())
      return true;
    Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
    if (!RegB.isVirtual() || !RegO.isVirtual())
      return true;

    // Both bases must be Phis whose initial (pre-loop) values are produced
    // by identical instructions; then the two bases hold equal addresses.
    MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
    MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
    if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
      return true;

    Register InitValB;
    Register LoopValB;
    Register InitValO;
    Register LoopValO;
    getPhiRegs(*DefB, BB, InitValB, LoopValB);
    getPhiRegs(*DefO, BB, InitValO, LoopValO);
    MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
    MachineInstr *InitDefO = MRI.getVRegDef(InitValO);

    if (!InitDefB->isIdenticalTo(*InitDefO))
      return true;
  }

  LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
  LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
    return true;

  LLVM_DEBUG({
    dbgs() << "Overlap check:\n";
    dbgs() << " BaseMI: ";
    BaseMI->dump();
    dbgs() << " Base + " << OffsetB << " + I * " << Delta
           << ", Len: " << AccessSizeB.getValue() << "\n";
    dbgs() << " OtherMI: ";
    OtherMI->dump();
    dbgs() << " Base + " << OffsetO << " + I * " << Delta
           << ", Len: " << AccessSizeO.getValue() << "\n";
  });

  // Excessive overlap may be detected in strided patterns.
  // For example, the memory addresses of the store and the load in
  // for (i=0; i<n; i+=2) a[i+1] = a[i];
  // are assumed to overlap.
  if (Delta < 0) {
    // Decreasing addresses: compare BaseMI's low end against OtherMI's high
    // end on the next iteration.
    // NOTE(review): "Ohter" below is a typo for "Other" — harmless local
    // name, worth renaming in a follow-up code change.
    int64_t BaseMinAddr = OffsetB;
    int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
    if (BaseMinAddr > OhterNextIterMaxAddr) {
      LLVM_DEBUG(dbgs() << " Result: No overlap\n");
      return false;
    }
  } else {
    // Increasing addresses: compare BaseMI's high end against OtherMI's low
    // end on the next iteration.
    int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
    int64_t OtherNextIterMinAddr = OffsetO + Delta;
    if (BaseMaxAddr < OtherNextIterMinAddr) {
      LLVM_DEBUG(dbgs() << " Result: No overlap\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << " Result: Overlap\n");
  return true;
}
3104
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order and output dependences can be loop carried; artificial edges
  // and edges into the exit boundary node never are.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    return true;

  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only a pair of memory operations can carry a memory dependence.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3138
3139void SwingSchedulerDAG::postProcessDAG() {
3140 for (auto &M : Mutations)
3141 M->apply(this);
3142}
3143
3144/// Try to schedule the node at the specified StartCycle and continue
3145/// until the node is schedule or the EndCycle is reached. This function
3146/// returns true if the node is scheduled. This routine may search either
3147/// forward or backward for a place to insert the instruction based upon
3148/// the relative values of StartCycle and EndCycle.
3149bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3150 bool forward = true;
3151 LLVM_DEBUG({
3152 dbgs() << "Trying to insert node between " << StartCycle << " and "
3153 << EndCycle << " II: " << II << "\n";
3154 });
3155 if (StartCycle > EndCycle)
3156 forward = false;
3157
3158 // The terminating condition depends on the direction.
3159 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3160 for (int curCycle = StartCycle; curCycle != termCycle;
3161 forward ? ++curCycle : --curCycle) {
3162
3163 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3164 ProcItinResources.canReserveResources(*SU, curCycle)) {
3165 LLVM_DEBUG({
3166 dbgs() << "\tinsert at cycle " << curCycle << " ";
3167 SU->getInstr()->dump();
3168 });
3169
3170 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3171 ProcItinResources.reserveResources(*SU, curCycle);
3172 ScheduledInstrs[curCycle].push_back(SU);
3173 InstrToCycle.insert(std::make_pair(SU, curCycle));
3174 if (curCycle > LastCycle)
3175 LastCycle = curCycle;
3176 if (curCycle < FirstCycle)
3177 FirstCycle = curCycle;
3178 return true;
3179 }
3180 LLVM_DEBUG({
3181 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3182 SU->getInstr()->dump();
3183 });
3184 }
3185 return false;
3186}
3187
// Return the cycle of the earliest scheduled instruction in the chain.
// Walks order/output edges backwards from the edge's source, taking the
// minimum scheduled cycle seen; yields INT_MAX if nothing is scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    // Each node is expanded at most once.
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Unscheduled nodes contribute nothing and are not expanded further.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3211
// Return the cycle of the latest scheduled instruction in the chain.
// Mirror image of earliestCycleInChain: walks order/output edges forward
// from the edge's destination, taking the maximum scheduled cycle seen;
// yields INT_MIN if nothing in the chain is scheduled yet.
    const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    // Skip already-expanded nodes and the exit boundary node.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3235
/// If an instruction has a use that spans multiple iterations, then
/// return true. These instructions are characterized by having a back-edge
/// to a Phi, which contains a reference to another Phi.
  // Look for an anti-dependence on a Phi whose value in turn feeds another
  // Phi: that value then lives across more than one iteration.
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3247
3248/// Compute the scheduling start slot for the instruction. The start slot
3249/// depends on any predecessor or successor nodes scheduled already.
3250void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3251 int II, SwingSchedulerDAG *DAG) {
3252 const SwingSchedulerDDG *DDG = DAG->getDDG();
3253
3254 // Iterate over each instruction that has been scheduled already. The start
3255 // slot computation depends on whether the previously scheduled instruction
3256 // is a predecessor or successor of the specified instruction.
3257 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3258 for (SUnit *I : getInstructions(cycle)) {
3259 for (const auto &IE : DDG->getInEdges(SU)) {
3260 if (IE.getSrc() == I) {
3261 // FIXME: Add reverse edge to `DDG` instead of calling
3262 // `isLoopCarriedDep`
3263 if (DAG->isLoopCarriedDep(IE)) {
3264 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3265 *MinLateStart = std::min(*MinLateStart, End);
3266 }
3267 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3268 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3269 }
3270 }
3271
3272 for (const auto &OE : DDG->getOutEdges(SU)) {
3273 if (OE.getDst() == I) {
3274 // FIXME: Add reverse edge to `DDG` instead of calling
3275 // `isLoopCarriedDep`
3276 if (DAG->isLoopCarriedDep(OE)) {
3277 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3278 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3279 }
3280 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3281 *MinLateStart = std::min(*MinLateStart, LateStart);
3282 }
3283 }
3284
3285 SUnit *BE = multipleIterations(I, DAG);
3286 for (const auto &Dep : SU->Preds) {
3287 // For instruction that requires multiple iterations, make sure that
3288 // the dependent instruction is not scheduled past the definition.
3289 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3290 !SU->isPred(I))
3291 *MinLateStart = std::min(*MinLateStart, cycle);
3292 }
3293 }
3294 }
3295}
3296
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is inserted at the front of the
/// per-cycle list when it must precede a use, otherwise it is appended.
/// (The old "Returns true/false" wording was stale: the function is void.)
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Placement flags relative to the already-ordered instructions, plus the
  // positions of the conflicting use/def found while scanning.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If this instruction's base register was rewritten (InstrChanges),
      // compare against the replacement register instead.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      // Erase the higher index first so the lower index stays valid.
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    // Re-insert all three instructions in dependence order.
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3429
/// Return true if the scheduled Phi has a loop carried operand.
    MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  // NOTE(review): this assert is redundant with the early return above.
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  // Conservatively treat an unscheduled or Phi-defined loop value as
  // loop carried.
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // Carried if the loop value is produced later in the cycle order, or in
  // an earlier-or-equal stage, than the Phi itself.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3452
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
/// v1 = phi(v2, v3)
/// (Def) v3 = op v1
/// (MO) = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
    MachineInstr *Def,
    MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a Phi in the same block as Def, and that Phi must be
  // loop carried.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def is the loop-carried definition iff it writes the Phi's loop operand.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3479
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
/// NOTE(review): as written this returns true only when no in-edge source
/// has been scheduled at all (InstrToCycle holds scheduled nodes) — confirm
/// against callers whether the name/comment or the code is authoritative.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3489
/// Determine transitive dependences of unpipelineable instructions
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with every instruction the target wants excluded
  // from pipelining.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable node depends on is also excluded.
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3518
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Already-pipelineable nodes (or those already in stage 0) only update
    // the running last-cycle value.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Physically move the instruction to its new cycle's list.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    // | Inst0 <-+
    // SU order | | artificial dep
    // | Inst1 --+
    // v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3587
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    // Only definitions of physical registers can invalidate the schedule.
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          // The physical-register use must be in the same stage and in a
          // strictly later cycle than its definition.
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3616
3617/// A property of the node order in swing-modulo-scheduling is
3618/// that for nodes outside circuits the following holds:
3619/// none of them is scheduled after both a successor and a
3620/// predecessor.
3621/// The method below checks whether the property is met.
3622/// If not, debug information is printed and statistics information updated.
3623/// Note that we do not use an assert statement.
3624/// The reason is that although an invalid node order may prevent
3625/// the pipeliner from finding a pipelined schedule for arbitrary II,
3626/// it does not lead to the generation of incorrect code.
3627void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3628
3629 // a sorted vector that maps each SUnit to its index in the NodeOrder
3630 typedef std::pair<SUnit *, unsigned> UnitIndex;
3631 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3632
3633 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3634 Indices.push_back(std::make_pair(NodeOrder[i], i));
3635
3636 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3637 return std::get<0>(i1) < std::get<0>(i2);
3638 };
3639
3640 // sort, so that we can perform a binary search
3641 llvm::sort(Indices, CompareKey);
3642
3643 bool Valid = true;
3644 (void)Valid;
3645 // for each SUnit in the NodeOrder, check whether
3646 // it appears after both a successor and a predecessor
3647 // of the SUnit. If this is the case, and the SUnit
3648 // is not part of circuit, then the NodeOrder is not
3649 // valid.
3650 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3651 SUnit *SU = NodeOrder[i];
3652 unsigned Index = i;
3653
3654 bool PredBefore = false;
3655 bool SuccBefore = false;
3656
3657 SUnit *Succ;
3658 SUnit *Pred;
3659 (void)Succ;
3660 (void)Pred;
3661
3662 for (const auto &IE : DDG->getInEdges(SU)) {
3663 SUnit *PredSU = IE.getSrc();
3664 unsigned PredIndex = std::get<1>(
3665 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3666 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3667 PredBefore = true;
3668 Pred = PredSU;
3669 break;
3670 }
3671 }
3672
3673 for (const auto &OE : DDG->getOutEdges(SU)) {
3674 SUnit *SuccSU = OE.getDst();
3675 // Do not process a boundary node, it was not included in NodeOrder,
3676 // hence not in Indices either, call to std::lower_bound() below will
3677 // return Indices.end().
3678 if (SuccSU->isBoundaryNode())
3679 continue;
3680 unsigned SuccIndex = std::get<1>(
3681 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3682 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3683 SuccBefore = true;
3684 Succ = SuccSU;
3685 break;
3686 }
3687 }
3688
3689 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3690 // instructions in circuits are allowed to be scheduled
3691 // after both a successor and predecessor.
3692 bool InCircuit = llvm::any_of(
3693 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3694 if (InCircuit)
3695 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3696 else {
3697 Valid = false;
3698 NumNodeOrderIssues++;
3699 LLVM_DEBUG(dbgs() << "Predecessor ");
3700 }
3701 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3702 << " are scheduled before node " << SU->NodeNum
3703 << "\n");
3704 }
3705 }
3706
3707 LLVM_DEBUG({
3708 if (!Valid)
3709 dbgs() << "Invalid node order found!\n";
3710 });
3711}
3712
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
/// p' = store_pi(p, b)
/// = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // State carried across instructions within the cycle: the register whose
  // lifetime overlaps (p) and the tied def that supersedes it (p').
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            // Subtract the recorded increment since the new base already
            // includes it.
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // The overlap has been handled; reset the search state.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3764
// Rebuild the order for one cycle's instruction list: all Phis first (in
// their original order), then every non-Phi placed via orderDependence so
// that definitions precede uses.
std::deque<SUnit *>
    const std::deque<SUnit *> &Instrs) const {
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  // Concatenate: Phis followed by the dependence-ordered instructions.
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}
3781
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      // Prepend in reverse so the stage's internal order is preserved.
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
3817
3819 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3820 << " depth " << MaxDepth << " col " << Colocate << "\n";
3821 for (const auto &I : Nodes)
3822 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3823 os << "\n";
3824}
3825
3826#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3827/// Print the schedule information to the given output.
3829 // Iterate over each cycle.
3830 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3831 // Iterate over each instruction in the cycle.
3832 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3833 for (SUnit *CI : cycleInstrs->second) {
3834 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3835 os << "(" << CI->NodeNum << ") ";
3836 CI->getInstr()->print(os);
3837 os << "\n";
3838 }
3839 }
3840}
3841
3842/// Utility function used for debugging to print the schedule.
3845
3846void ResourceManager::dumpMRT() const {
3847 LLVM_DEBUG({
3848 if (UseDFA)
3849 return;
3850 std::stringstream SS;
3851 SS << "MRT:\n";
3852 SS << std::setw(4) << "Slot";
3853 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3854 SS << std::setw(3) << I;
3855 SS << std::setw(7) << "#Mops"
3856 << "\n";
3857 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3858 SS << std::setw(4) << Slot;
3859 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3860 SS << std::setw(3) << MRT[Slot][I];
3861 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3862 }
3863 dbgs() << SS.str();
3864 });
3865}
3866#endif
3867
3869 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3870 unsigned ProcResourceID = 0;
3871
3872 // We currently limit the resource kinds to 64 and below so that we can use
3873 // uint64_t for Masks
3874 assert(SM.getNumProcResourceKinds() < 64 &&
3875 "Too many kinds of resources, unsupported");
3876 // Create a unique bitmask for every processor resource unit.
3877 // Skip resource at index 0, since it always references 'InvalidUnit'.
3878 Masks.resize(SM.getNumProcResourceKinds());
3879 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3880 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3881 if (Desc.SubUnitsIdxBegin)
3882 continue;
3883 Masks[I] = 1ULL << ProcResourceID;
3884 ProcResourceID++;
3885 }
3886 // Create a unique bitmask for every processor resource group.
3887 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3888 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3889 if (!Desc.SubUnitsIdxBegin)
3890 continue;
3891 Masks[I] = 1ULL << ProcResourceID;
3892 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3893 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3894 ProcResourceID++;
3895 }
3896 LLVM_DEBUG({
3897 if (SwpShowResMask) {
3898 dbgs() << "ProcResourceDesc:\n";
3899 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3900 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3901 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3902 ProcResource->Name, I, Masks[I],
3903 ProcResource->NumUnits);
3904 }
3905 dbgs() << " -----------------\n";
3906 }
3907 });
3908}
3909
3911 LLVM_DEBUG({
3912 if (SwpDebugResource)
3913 dbgs() << "canReserveResources:\n";
3914 });
3915 if (UseDFA)
3916 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3917 ->canReserveResources(&SU.getInstr()->getDesc());
3918
3919 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3920 if (!SCDesc->isValid()) {
3921 LLVM_DEBUG({
3922 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3923 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3924 });
3925 return true;
3926 }
3927
3928 reserveResources(SCDesc, Cycle);
3929 bool Result = !isOverbooked();
3930 unreserveResources(SCDesc, Cycle);
3931
3932 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3933 return Result;
3934}
3935
3936void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3937 LLVM_DEBUG({
3938 if (SwpDebugResource)
3939 dbgs() << "reserveResources:\n";
3940 });
3941 if (UseDFA)
3942 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3943 ->reserveResources(&SU.getInstr()->getDesc());
3944
3945 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3946 if (!SCDesc->isValid()) {
3947 LLVM_DEBUG({
3948 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3949 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3950 });
3951 return;
3952 }
3953
3954 reserveResources(SCDesc, Cycle);
3955
3956 LLVM_DEBUG({
3957 if (SwpDebugResource) {
3958 dumpMRT();
3959 dbgs() << "reserveResources: done!\n\n";
3960 }
3961 });
3962}
3963
3964void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3965 int Cycle) {
3966 assert(!UseDFA);
3967 for (const MCWriteProcResEntry &PRE : make_range(
3968 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3969 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3970 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3971
3972 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3973 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3974}
3975
3976void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3977 int Cycle) {
3978 assert(!UseDFA);
3979 for (const MCWriteProcResEntry &PRE : make_range(
3980 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3981 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3982 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3983
3984 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3985 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3986}
3987
3988bool ResourceManager::isOverbooked() const {
3989 assert(!UseDFA);
3990 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3991 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3992 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3993 if (MRT[Slot][I] > Desc->NumUnits)
3994 return true;
3995 }
3996 if (NumScheduledMops[Slot] > IssueWidth)
3997 return true;
3998 }
3999 return false;
4000}
4001
4002int ResourceManager::calculateResMIIDFA() const {
4003 assert(UseDFA);
4004
4005 // Sort the instructions by the number of available choices for scheduling,
4006 // least to most. Use the number of critical resources as the tie breaker.
4007 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4008 for (SUnit &SU : DAG->SUnits)
4009 FUS.calcCriticalResources(*SU.getInstr());
4010 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4011 FuncUnitOrder(FUS);
4012
4013 for (SUnit &SU : DAG->SUnits)
4014 FuncUnitOrder.push(SU.getInstr());
4015
4017 Resources.push_back(
4018 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4019
4020 while (!FuncUnitOrder.empty()) {
4021 MachineInstr *MI = FuncUnitOrder.top();
4022 FuncUnitOrder.pop();
4023 if (TII->isZeroCost(MI->getOpcode()))
4024 continue;
4025
4026 // Attempt to reserve the instruction in an existing DFA. At least one
4027 // DFA is needed for each cycle.
4028 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4029 unsigned ReservedCycles = 0;
4030 auto *RI = Resources.begin();
4031 auto *RE = Resources.end();
4032 LLVM_DEBUG({
4033 dbgs() << "Trying to reserve resource for " << NumCycles
4034 << " cycles for \n";
4035 MI->dump();
4036 });
4037 for (unsigned C = 0; C < NumCycles; ++C)
4038 while (RI != RE) {
4039 if ((*RI)->canReserveResources(*MI)) {
4040 (*RI)->reserveResources(*MI);
4041 ++ReservedCycles;
4042 break;
4043 }
4044 RI++;
4045 }
4046 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4047 << ", NumCycles:" << NumCycles << "\n");
4048 // Add new DFAs, if needed, to reserve resources.
4049 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4051 << "NewResource created to reserve resources"
4052 << "\n");
4053 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4054 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4055 NewResource->reserveResources(*MI);
4056 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4057 }
4058 }
4059
4060 int Resmii = Resources.size();
4061 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4062 return Resmii;
4063}
4064
4066 if (UseDFA)
4067 return calculateResMIIDFA();
4068
4069 // Count each resource consumption and divide it by the number of units.
4070 // ResMII is the max value among them.
4071
4072 int NumMops = 0;
4073 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4074 for (SUnit &SU : DAG->SUnits) {
4075 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4076 continue;
4077
4078 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4079 if (!SCDesc->isValid())
4080 continue;
4081
4082 LLVM_DEBUG({
4083 if (SwpDebugResource) {
4084 DAG->dumpNode(SU);
4085 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4086 << " WriteProcRes: ";
4087 }
4088 });
4089 NumMops += SCDesc->NumMicroOps;
4090 for (const MCWriteProcResEntry &PRE :
4091 make_range(STI->getWriteProcResBegin(SCDesc),
4092 STI->getWriteProcResEnd(SCDesc))) {
4093 LLVM_DEBUG({
4094 if (SwpDebugResource) {
4095 const MCProcResourceDesc *Desc =
4096 SM.getProcResource(PRE.ProcResourceIdx);
4097 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4098 }
4099 });
4100 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4101 }
4102 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4103 }
4104
4105 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4106 LLVM_DEBUG({
4107 if (SwpDebugResource)
4108 dbgs() << "#Mops: " << NumMops << ", "
4109 << "IssueWidth: " << IssueWidth << ", "
4110 << "Cycles: " << Result << "\n";
4111 });
4112
4113 LLVM_DEBUG({
4114 if (SwpDebugResource) {
4115 std::stringstream SS;
4116 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4117 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4118 << "\n";
4119 dbgs() << SS.str();
4120 }
4121 });
4122 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4123 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4124 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4125 LLVM_DEBUG({
4126 if (SwpDebugResource) {
4127 std::stringstream SS;
4128 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4129 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4130 << std::setw(10) << Cycles << "\n";
4131 dbgs() << SS.str();
4132 }
4133 });
4134 if (Cycles > Result)
4135 Result = Cycles;
4136 }
4137 return Result;
4138}
4139
4141 InitiationInterval = II;
4142 DFAResources.clear();
4143 DFAResources.resize(II);
4144 for (auto &I : DFAResources)
4145 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4146 MRT.clear();
4147 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4148 NumScheduledMops.clear();
4149 NumScheduledMops.resize(II);
4150}
4151
4152bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4153 if (Pred.isArtificial() || Dst->isBoundaryNode())
4154 return true;
4155 // Currently, dependence that is an anti-dependences but not a loop-carried is
4156 // also ignored. This behavior is preserved to prevent regression.
4157 // FIXME: Remove if this doesn't have significant impact on performance
4158 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4159}
4160
4161SwingSchedulerDDG::SwingSchedulerDDGEdges &
4162SwingSchedulerDDG::getEdges(const SUnit *SU) {
4163 if (SU == EntrySU)
4164 return EntrySUEdges;
4165 if (SU == ExitSU)
4166 return ExitSUEdges;
4167 return EdgesVec[SU->NodeNum];
4168}
4169
4170const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4171SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4172 if (SU == EntrySU)
4173 return EntrySUEdges;
4174 if (SU == ExitSU)
4175 return ExitSUEdges;
4176 return EdgesVec[SU->NodeNum];
4177}
4178
4179void SwingSchedulerDDG::addEdge(const SUnit *SU,
4180 const SwingSchedulerDDGEdge &Edge) {
4181 assert(!Edge.isValidationOnly() &&
4182 "Validation-only edges are not expected here.");
4183 auto &Edges = getEdges(SU);
4184 if (Edge.getSrc() == SU)
4185 Edges.Succs.push_back(Edge);
4186 else
4187 Edges.Preds.push_back(Edge);
4188}
4189
4190void SwingSchedulerDDG::initEdges(SUnit *SU) {
4191 for (const auto &PI : SU->Preds) {
4192 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4193 /*IsValidationOnly=*/false);
4194 addEdge(SU, Edge);
4195 }
4196
4197 for (const auto &SI : SU->Succs) {
4198 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4199 /*IsValidationOnly=*/false);
4200 addEdge(SU, Edge);
4201 }
4202}
4203
4204SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4205 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4206 : EntrySU(EntrySU), ExitSU(ExitSU) {
4207 EdgesVec.resize(SUnits.size());
4208
4209 // Add non-loop-carried edges based on the DAG.
4210 initEdges(EntrySU);
4211 initEdges(ExitSU);
4212 for (auto &SU : SUnits)
4213 initEdges(&SU);
4214
4215 // Add loop-carried edges, which are not represented in the DAG.
4216 for (SUnit &SU : SUnits) {
4217 SUnit *Src = &SU;
4218 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4219 SDep Base(Src, SDep::Barrier);
4220 Base.setLatency(1);
4221 for (SUnit *Dst : *OD) {
4222 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4223 /*IsValidationOnly=*/true);
4224 Edge.setDistance(1);
4225 ValidationOnlyEdges.push_back(Edge);
4226 }
4227 }
4228 }
4229}
4230
4231const SwingSchedulerDDG::EdgesType &
4233 return getEdges(SU).Preds;
4234}
4235
4236const SwingSchedulerDDG::EdgesType &
4238 return getEdges(SU).Succs;
4239}
4240
4241/// Check if \p Schedule doesn't violate the validation-only dependencies.
4243 unsigned II = Schedule.getInitiationInterval();
4244
4245 auto ExpandCycle = [&](SUnit *SU) {
4246 int Stage = Schedule.stageScheduled(SU);
4247 int Cycle = Schedule.cycleScheduled(SU);
4248 return Cycle + (Stage * II);
4249 };
4250
4251 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4252 SUnit *Src = Edge.getSrc();
4253 SUnit *Dst = Edge.getDst();
4254 if (!Src->isInstr() || !Dst->isInstr())
4255 continue;
4256 int CycleSrc = ExpandCycle(Src);
4257 int CycleDst = ExpandCycle(Dst);
4258 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4259 if (CycleSrc > MaxLateStart) {
4260 LLVM_DEBUG({
4261 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4262 << Dst->NodeNum << "\n";
4263 });
4264 return false;
4265 }
4266 }
4267 return true;
4268}
4269
4270void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4271 const TargetInstrInfo *TII) {
4272 for (SUnit &SU : SUnits) {
4273 SUnit *Src = &SU;
4274 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4275 SDep Dep(Src, SDep::Barrier);
4276 Dep.setLatency(1);
4277 for (SUnit *Dst : *OrderDep) {
4278 SUnit *From = Src;
4279 SUnit *To = Dst;
4280 if (From->NodeNum > To->NodeNum)
4281 std::swap(From, To);
4282
4283 // Add a forward edge if the following conditions are met:
4284 //
4285 // - The instruction of the source node (FromMI) may read memory.
4286 // - The instruction of the target node (ToMI) may modify memory, but
4287 // does not read it.
4288 // - Neither instruction is a global barrier.
4289 // - The load appears before the store in the original basic block.
4290 // - There are no barrier or store instructions between the two nodes.
4291 // - The target node is unreachable from the source node in the current
4292 // DAG.
4293 //
4294 // TODO: These conditions are inherited from a previous implementation,
4295 // and some may no longer be necessary. For now, we conservatively
4296 // retain all of them to avoid regressions, but the logic could
4297 // potentially be simplified
4298 MachineInstr *FromMI = From->getInstr();
4299 MachineInstr *ToMI = To->getInstr();
4300 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4301 !TII->isGlobalMemoryObject(FromMI) &&
4302 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4303 SDep Pred = Dep;
4304 Pred.setSUnit(From);
4305 To->addPred(Pred);
4306 }
4307 }
4308 }
4309 }
4310}
4311
4313 const MachineRegisterInfo *MRI) const {
4314 const auto *Order = getOrderDepOrNull(SU);
4315
4316 if (!Order)
4317 return;
4318
4319 const auto DumpSU = [](const SUnit *SU) {
4320 std::ostringstream OSS;
4321 OSS << "SU(" << SU->NodeNum << ")";
4322 return OSS.str();
4323 };
4324
4325 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4326 << " Order\n";
4327 for (SUnit *Dst : *Order)
4328 dbgs() << " " << DumpSU(Dst) << "\n";
4329}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:270
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes from the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allowed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD, bool PerformCheapCheck)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool erase(const KeyT &Val)
Definition DenseMap.h:311
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:618
MachineInstrBundleIterator< const MachineInstr > const_iterator
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
void insert_range(Range &&R)
Definition SetVector.h:175
typename vector_type::const_iterator iterator
Definition SetVector.h:71
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:261
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
Definition MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
Op::Description Desc
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
unsigned MCRegUnit
Register units are used to compute register aliasing.
Definition MCRegister.h:30
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1994
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.