LLVM 23.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE "pipeliner"
102
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
115
116/// A command line option to turn software pipelining on or off.
117static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
118 cl::desc("Enable Software Pipelining"));
119
120/// A command line option to enable SWP at -Os.
121static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
122 cl::desc("Enable SWP at Os."), cl::Hidden,
123 cl::init(false));
124
/// A command line argument that bounds the *maximum* initiation interval the
/// pipeliner will accept: loops whose computed MII exceeds this limit are not
/// pipelined (see the `(int)MII > SwpMaxMii` check in schedule()); -1 disables
/// the limit.
126static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
127 cl::desc("Size limit for the MII."),
128 cl::Hidden, cl::init(27));
129
130/// A command line argument to force pipeliner to use specified initial
131/// interval.
132static cl::opt<int> SwpForceII("pipeliner-force-ii",
133 cl::desc("Force pipeliner to use specified II."),
134 cl::Hidden, cl::init(-1));
135
136/// A command line argument to limit the number of stages in the pipeline.
137static cl::opt<int>
138 SwpMaxStages("pipeliner-max-stages",
139 cl::desc("Maximum stages allowed in the generated scheduled."),
140 cl::Hidden, cl::init(3));
141
142/// A command line option to disable the pruning of chain dependences due to
143/// an unrelated Phi.
144static cl::opt<bool>
145 SwpPruneDeps("pipeliner-prune-deps",
146 cl::desc("Prune dependences between unrelated Phi nodes."),
147 cl::Hidden, cl::init(true));
148
149/// A command line option to disable the pruning of loop carried order
150/// dependences.
151static cl::opt<bool>
152 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
153 cl::desc("Prune loop carried order dependences."),
154 cl::Hidden, cl::init(true));
155
156#ifndef NDEBUG
157static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
158#endif
159
160static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
162 cl::desc("Ignore RecMII"));
163
164static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
165 cl::init(false));
166static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
167 cl::init(false));
168
170 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
171 cl::desc("Instead of emitting the pipelined code, annotate instructions "
172 "with the generated schedule for feeding into the "
173 "-modulo-schedule-test pass"));
174
176 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
177 cl::desc(
178 "Use the experimental peeling code generator for software pipelining"));
179
180static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
181 cl::desc("Range to search for II"),
182 cl::Hidden, cl::init(10));
183
184static cl::opt<bool>
185 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
186 cl::desc("Limit register pressure of scheduled loop"));
187
188static cl::opt<int>
189 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
190 cl::init(5),
191 cl::desc("Margin representing the unused percentage of "
192 "the register pressure limit"));
193
194static cl::opt<bool>
195 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
196 cl::desc("Use the MVE code generator for software pipelining"));
197
198/// A command line argument to limit the number of store instructions in the
199/// target basic block.
201 "pipeliner-max-num-stores",
202 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
203 cl::init(200));
204
205// A command line option to enable the CopyToPhi DAG mutation.
207 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
208 cl::init(true),
209 cl::desc("Enable CopyToPhi DAG Mutation"));
210
211/// A command line argument to force pipeliner to use specified issue
212/// width.
214 "pipeliner-force-issue-width",
215 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
216 cl::init(-1));
217
218/// A command line argument to set the window scheduling option.
221 cl::desc("Set how to use window scheduling algorithm."),
223 "Turn off window algorithm."),
225 "Use window algorithm after SMS algorithm fails."),
227 "Use window algorithm instead of SMS algorithm.")));
228
229unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
230char MachinePipeliner::ID = 0;
231#ifndef NDEBUG
233#endif
235
237 "Modulo Software Pipelining", false, false)
243 "Modulo Software Pipelining", false, false)
244
245namespace {
246
247/// This class holds an SUnit corresponding to a memory operation and other
248/// information related to the instruction.
252
253 /// The value of a memory operand.
254 const Value *MemOpValue = nullptr;
255
256 /// The offset of a memory operand.
257 int64_t MemOpOffset = 0;
258
260
261 /// True if all the underlying objects are identified.
262 bool IsAllIdentified = false;
263
265
266 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
267
268 bool isUnknown() const { return MemOpValue == nullptr; }
269
270private:
272};
273
274/// Add loop-carried chain dependencies. This class handles the same type of
275/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
276/// account dependencies across iterations.
278 // Type of instruction that is relevant to order-dependencies
279 enum class InstrTag {
280 Barrier = 0, ///< A barrier event instruction.
281 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
282 ///< not a barrier event.
283 FPExceptions = 2, ///< An instruction that does not match above, but may
 284 ///< raise floating-point exceptions.
285 };
286
287 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
288 TaggedSUnit(SUnit *SU, InstrTag Tag)
289 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
290
291 InstrTag getTag() const { return InstrTag(getInt()); }
292 };
293
294 /// Holds instructions that may form loop-carried order-dependencies, but not
295 /// global barriers.
296 struct NoBarrierInstsChunk {
300
301 void append(SUnit *SU);
302 };
303
305 BatchAAResults *BAA;
306 std::vector<SUnit> &SUnits;
307
308 /// The size of SUnits, for convenience.
309 const unsigned N;
310
311 /// Loop-carried Edges.
312 std::vector<BitVector> LoopCarried;
313
314 /// Instructions related to chain dependencies. They are one of the
315 /// following:
316 ///
317 /// 1. Barrier event.
318 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
319 /// value.
320 /// 3. Store, but not a barrier event.
321 /// 4. None of them, but may raise floating-point exceptions.
322 ///
323 /// This is used when analyzing loop-carried dependencies that access global
324 /// barrier instructions.
325 std::vector<TaggedSUnit> TaggedSUnits;
326
327 const TargetInstrInfo *TII = nullptr;
328 const TargetRegisterInfo *TRI = nullptr;
329
330public:
332 const TargetInstrInfo *TII,
333 const TargetRegisterInfo *TRI);
334
335 /// The main function to compute loop-carried order-dependencies.
336 void computeDependencies();
337
338 const BitVector &getLoopCarried(unsigned Idx) const {
339 return LoopCarried[Idx];
340 }
341
342private:
343 /// Tags to \p SU if the instruction may affect the order-dependencies.
344 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
345
346 void addLoopCarriedDepenenciesForChunks(const NoBarrierInstsChunk &From,
347 const NoBarrierInstsChunk &To);
348
349 /// Add a loop-carried order dependency between \p Src and \p Dst if we
350 /// cannot prove they are independent.
351 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
352 const SUnitWithMemInfo &Dst);
353
354 void computeDependenciesAux();
355
356 void setLoopCarriedDep(const SUnit *Src, const SUnit *Dst) {
357 LoopCarried[Src->NodeNum].set(Dst->NodeNum);
358 }
359};
360
361} // end anonymous namespace
362
363/// The "main" function for implementing Swing Modulo Scheduling.
// NOTE(review): the signature line (doxygen 364, presumably
// `bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {`) and
// lines 375, 381-382, 386-388 are missing from this listing — confirm against
// the full source before editing this function.
 365 if (skipFunction(mf.getFunction()))
 366 return false;
 367
// Global command-line off-switch for the pass.
 368 if (!EnableSWP)
 369 return false;
 370
// Skip size-optimized functions unless -enable-pipeliner-opt-size was given.
 371 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
 372 !EnableSWPOptSize.getPosition())
 373 return false;
 374
 376 return false;
 377
 378 // Cannot pipeline loops without instruction itineraries if we are using
 379 // DFA for the pipeliner.
 380 if (mf.getSubtarget().useDFAforSMS() &&
 383 return false;
 384
// Cache the function and target info used by the rest of the pass.
 385 MF = &mf;
 389 TII = MF->getSubtarget().getInstrInfo();
 390 RegClassInfo.runOnMachineFunction(*MF);
 391
// Attempt to pipeline every top-level loop; scheduleLoop recurses into
// inner loops first.
 392 for (const auto &L : *MLI)
 393 scheduleLoop(*L);
 394
// Always returns false here regardless of whether loops were rewritten.
 395 return false;
 396}
397
398/// Attempt to perform the SMS algorithm on the specified loop. This function is
399/// the main entry point for the algorithm. The function identifies candidate
400/// loops, calculates the minimum initiation interval, and attempts to schedule
401/// the loop.
402bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
 403 bool Changed = false;
// Recurse into child loops first so innermost loops are pipelined before
// their parents are considered.
 404 for (const auto &InnerLoop : L)
 405 Changed |= scheduleLoop(*InnerLoop);
 406
 407#ifndef NDEBUG
 408 // Stop trying after reaching the limit (if any).
 409 int Limit = SwpLoopLimit;
 410 if (Limit >= 0) {
 411 if (NumTries >= SwpLoopLimit)
 412 return Changed;
 413 NumTries++;
 414 }
 415#endif
 416
// Pick up any per-loop pragma settings (II override / disable) before the
// legality check.
 417 setPragmaPipelineOptions(L);
 418 if (!canPipelineLoop(L)) {
 419 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
 420 ORE->emit([&]() {
 421 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
 422 L.getStartLoc(), L.getHeader())
 423 << "Failed to pipeline loop";
 424 });
 425
 426 LI.LoopPipelinerInfo.reset();
 427 return Changed;
 428 }
 429
 430 ++NumTrytoPipeline;
// Try SMS first; window scheduling runs afterwards when its own gating
// conditions (including the Changed flag) allow it.
 431 if (useSwingModuloScheduler())
 432 Changed = swingModuloScheduler(L);
 433
 434 if (useWindowScheduler(Changed))
 435 Changed = runWindowScheduler(L);
 436
// Drop the target's loop analysis for this loop before moving on.
 437 LI.LoopPipelinerInfo.reset();
 438 return Changed;
 439}
440
// Read the loop-metadata pragmas attached to \p L's terminator and record
// them in the pass members: II_setByPragma (from
// llvm.loop.pipeline.initiationinterval) and disabledByPragma (from
// llvm.loop.pipeline.disable). Both are reset first, so stale values from a
// previously visited loop never leak into this one.
441void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
 442 // Reset the pragma for the next loop in iteration.
 443 disabledByPragma = false;
 444 II_setByPragma = 0;
 445
 446 MachineBasicBlock *LBLK = L.getTopBlock();
 447
 448 if (LBLK == nullptr)
 449 return;
 450
// The loop metadata lives on the IR-level block's terminator; bail out
// quietly whenever the IR chain is unavailable.
 451 const BasicBlock *BBLK = LBLK->getBasicBlock();
 452 if (BBLK == nullptr)
 453 return;
 454
 455 const Instruction *TI = BBLK->getTerminator();
 456 if (TI == nullptr)
 457 return;
 458
 459 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
 460 if (LoopID == nullptr)
 461 return;
 462
 463 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
 464 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
 465
// Operand 0 is the self-referencing loop id; the rest are hint nodes of the
// form !{!"name", value...}.
 466 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
 467 MDNode *MD = dyn_cast<MDNode>(MDO);
 468
 469 if (MD == nullptr)
 470 continue;
 471
 472 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
 473
 474 if (S == nullptr)
 475 continue;
 476
 477 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
 478 assert(MD->getNumOperands() == 2 &&
 479 "Pipeline initiation interval hint metadata should have two operands.");
// NOTE(review): the assignment target (doxygen line 480, presumably
// `II_setByPragma =`) is missing from this listing — confirm against the
// full source.
 481 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
 482 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
 483 } else if (S->getString() == "llvm.loop.pipeline.disable") {
 484 disabledByPragma = true;
 485 }
 486 }
 487}
488
489/// Depth-first search to detect cycles among PHI dependencies.
490/// Returns true if a cycle is detected within the PHI-only subgraph.
491static bool hasPHICycleDFS(
492 unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
493 SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {
494
495 // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
496 auto It = PhiDeps.find(Reg);
497 if (It == PhiDeps.end())
498 return false;
499
500 if (RecStack.count(Reg))
501 return true; // backedge.
502 if (Visited.count(Reg))
503 return false;
504
505 Visited.insert(Reg);
506 RecStack.insert(Reg);
507
508 for (unsigned Dep : It->second) {
509 if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
510 return true;
511 }
512
513 RecStack.erase(Reg);
514 return false;
515}
516
// Return true if the PHI nodes in \p LoopHeader reference each other in a
// cycle, considering only PHI-to-PHI edges (a value defined by a non-PHI
// instruction terminates the search inside hasPHICycleDFS).
// NOTE(review): the MRI parameter is unused in the visible body, and the
// declaration of PhiDeps (doxygen line 519, presumably
// `DenseMap<unsigned, SmallVector<unsigned, 2>> PhiDeps;`) is missing from
// this listing — confirm against the full source.
517static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
518 const MachineRegisterInfo &MRI) {
 520
 521 // Collect PHI nodes and their dependencies.
 522 for (const MachineInstr &MI : LoopHeader->phis()) {
 523 unsigned DefReg = MI.getOperand(0).getReg();
 524 auto Ins = PhiDeps.try_emplace(DefReg).first;
 525
 526 // PHI operands are (Reg, MBB) pairs starting at index 1.
// Note: every incoming value is recorded, including the preheader value.
 527 for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
 528 Ins->second.push_back(MI.getOperand(I).getReg());
 529 }
 530
 531 // DFS to detect cycles among PHI nodes.
// Visited is shared across start nodes so each register is explored once.
 532 SmallSet<unsigned, 8> Visited, RecStack;
 533
 534 // Start DFS from each PHI-def.
 535 for (const auto &KV : PhiDeps) {
 536 unsigned Reg = KV.first;
 537 if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
 538 return true;
 539 }
 540
 541 return false;
 542}
543
544/// Return true if the loop can be software pipelined. The algorithm is
545/// restricted to loops with a single basic block. Make sure that the
546/// branch in the loop can be analyzed.
547bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
548 if (L.getNumBlocks() != 1) {
549 ORE->emit([&]() {
550 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
551 L.getStartLoc(), L.getHeader())
552 << "Not a single basic block: "
553 << ore::NV("NumBlocks", L.getNumBlocks());
554 });
555 return false;
556 }
557
558 if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
559 LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
560 return false;
561 }
562
563 if (disabledByPragma) {
564 ORE->emit([&]() {
565 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
566 L.getStartLoc(), L.getHeader())
567 << "Disabled by Pragma.";
568 });
569 return false;
570 }
571
572 // Check if the branch can't be understood because we can't do pipelining
573 // if that's the case.
574 LI.TBB = nullptr;
575 LI.FBB = nullptr;
576 LI.BrCond.clear();
577 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
578 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
579 NumFailBranch++;
580 ORE->emit([&]() {
581 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
582 L.getStartLoc(), L.getHeader())
583 << "The branch can't be understood";
584 });
585 return false;
586 }
587
588 LI.LoopInductionVar = nullptr;
589 LI.LoopCompare = nullptr;
590 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
591 if (!LI.LoopPipelinerInfo) {
592 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
593 NumFailLoop++;
594 ORE->emit([&]() {
595 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
596 L.getStartLoc(), L.getHeader())
597 << "The loop structure is not supported";
598 });
599 return false;
600 }
601
602 if (!L.getLoopPreheader()) {
603 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
604 NumFailPreheader++;
605 ORE->emit([&]() {
606 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
607 L.getStartLoc(), L.getHeader())
608 << "No loop preheader found";
609 });
610 return false;
611 }
612
613 unsigned NumStores = 0;
614 for (MachineInstr &MI : *L.getHeader())
615 if (MI.mayStore())
616 ++NumStores;
617 if (NumStores > SwpMaxNumStores) {
618 LLVM_DEBUG(dbgs() << "Too many stores\n");
619 NumFailTooManyStores++;
620 ORE->emit([&]() {
621 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
622 L.getStartLoc(), L.getHeader())
623 << "Too many store instructions in the loop: "
624 << ore::NV("NumStores", NumStores) << " > "
625 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
626 });
627 return false;
628 }
629
630 // Remove any subregisters from inputs to phi nodes.
631 preprocessPhiNodes(*L.getHeader());
632 return true;
633}
634
// Rewrite PHI inputs that carry a subregister index: each such input is
// replaced by a fresh full register defined by a COPY inserted in the
// corresponding predecessor block, so the scheduler only sees subreg-free
// PHIs. The new COPY is registered with SlotIndexes to keep LiveIntervals
// consistent.
635void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
 636 MachineRegisterInfo &MRI = MF->getRegInfo();
 637 SlotIndexes &Slots =
 638 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
 639
 640 for (MachineInstr &PI : B.phis()) {
 641 MachineOperand &DefOp = PI.getOperand(0);
 642 assert(DefOp.getSubReg() == 0);
 643 auto *RC = MRI.getRegClass(DefOp.getReg());
 644
// PHI operands come in (Reg, MBB) pairs starting at index 1.
 645 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
 646 MachineOperand &RegOp = PI.getOperand(i);
 647 if (RegOp.getSubReg() == 0)
 648 continue;
 649
 650 // If the operand uses a subregister, replace it with a new register
 651 // without subregisters, and generate a copy to the new register.
 652 Register NewReg = MRI.createVirtualRegister(RC);
 653 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
// NOTE(review): the definition of `At` (doxygen line 654, presumably the
// predecessor's first terminator iterator) is missing from this listing —
// confirm against the full source.
 655 const DebugLoc &DL = PredB.findDebugLoc(At);
 656 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
 657 .addReg(RegOp.getReg(), getRegState(RegOp),
 658 RegOp.getSubReg());
 659 Slots.insertMachineInstrInMaps(*Copy);
 660 RegOp.setReg(NewReg);
 661 RegOp.setSubReg(0);
 662 }
 663 }
 664}
665
666/// The SMS algorithm consists of the following main steps:
667/// 1. Computation and analysis of the dependence graph.
668/// 2. Ordering of the nodes (instructions).
669/// 3. Attempt to Schedule the loop.
// Returns true when the SwingSchedulerDAG produced a new schedule for \p L.
670bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
 671 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 672
 673 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 674 SwingSchedulerDAG SMS(
// NOTE(review): the first constructor-argument line (doxygen 675) is missing
// from this listing — confirm the full argument list against the source.
 676 II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
 677
 678 MachineBasicBlock *MBB = L.getHeader();
 679 // The kernel should not include any terminator instructions. These
 680 // will be added back later.
 681 SMS.startBlock(MBB);
 682
 683 // Compute the number of 'real' instructions in the basic block by
 684 // ignoring terminators.
 685 unsigned size = MBB->size();
// NOTE(review): the for-loop header (doxygen line 686, presumably iterating
// from the first terminator) is missing from this listing; the visible tail
// decrements `size` once per terminator instruction.
 687 E = MBB->instr_end();
 688 I != E; ++I, --size)
 689 ;
 690
 691 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
 692 SMS.schedule();
 693 SMS.exitRegion();
 694
 695 SMS.finishBlock();
 696 return SMS.hasNewSchedule();
698
709
710bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
711 MachineSchedContext Context;
712 Context.MF = MF;
713 Context.MLI = MLI;
714 Context.MDT = MDT;
715 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
716 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
717 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
718 Context.RegClassInfo->runOnMachineFunction(*MF);
719 WindowScheduler WS(&Context, L);
720 return WS.run();
721}
722
// Whether the SMS algorithm should be attempted for the current loop.
723bool MachinePipeliner::useSwingModuloScheduler() {
 724 // SwingModuloScheduler does not work when WindowScheduler is forced.
// NOTE(review): the return statement (doxygen line 725) is missing from this
// listing — presumably it compares the window-scheduling option against its
// "force" setting. Confirm against the full source.
 726}
727
// Whether window scheduling should run, given that SMS already ran and set
// \p Changed when it succeeded.
728bool MachinePipeliner::useWindowScheduler(bool Changed) {
 729 // WindowScheduler does not work for following cases:
 730 // 1. when it is off.
 731 // 2. when SwingModuloScheduler is successfully scheduled.
 732 // 3. when pragma II is enabled.
 733 if (II_setByPragma) {
 734 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
 735 "llvm.loop.pipeline.initiationinterval is set.\n");
 736 return false;
 737 }
 738
// NOTE(review): the final return expression (doxygen lines 739-740, which
// per the comment above should consult the window-scheduling option and the
// Changed flag) is missing from this listing — confirm against the source.
 741}
742
743void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
744 if (SwpForceII > 0)
745 MII = SwpForceII;
746 else if (II_setByPragma > 0)
747 MII = II_setByPragma;
748 else
749 MII = std::max(ResMII, RecMII);
750}
751
752void SwingSchedulerDAG::setMAX_II() {
753 if (SwpForceII > 0)
754 MAX_II = SwpForceII;
755 else if (II_setByPragma > 0)
756 MAX_II = II_setByPragma;
757 else
758 MAX_II = MII + SwpIISearchRange;
759}
760
761/// We override the schedule function in ScheduleDAGInstrs to implement the
762/// scheduling part of the Swing Modulo Scheduling algorithm.
764 buildSchedGraph(AA);
765 const LoopCarriedEdges LCE = addLoopCarriedDependences();
766 updatePhiDependences();
767 Topo.InitDAGTopologicalSorting();
768 changeDependences();
769 postProcessDAG();
770 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
771 LLVM_DEBUG({
772 dump();
773 dbgs() << "===== Loop Carried Edges Begin =====\n";
774 for (SUnit &SU : SUnits)
775 LCE.dump(&SU, TRI, &MRI);
776 dbgs() << "===== Loop Carried Edges End =====\n";
777 });
778
779 NodeSetType NodeSets;
780 findCircuits(NodeSets);
781 NodeSetType Circuits = NodeSets;
782
783 // Calculate the MII.
784 unsigned ResMII = calculateResMII();
785 unsigned RecMII = calculateRecMII(NodeSets);
786
787 fuseRecs(NodeSets);
788
789 // This flag is used for testing and can cause correctness problems.
790 if (SwpIgnoreRecMII)
791 RecMII = 0;
792
793 setMII(ResMII, RecMII);
794 setMAX_II();
795
796 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
797 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
798
799 // Can't schedule a loop without a valid MII.
800 if (MII == 0) {
801 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
802 NumFailZeroMII++;
803 Pass.ORE->emit([&]() {
805 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
806 << "Invalid Minimal Initiation Interval: 0";
807 });
808 return;
809 }
810
811 // Don't pipeline large loops.
812 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
813 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
814 << ", we don't pipeline large loops\n");
815 NumFailLargeMaxMII++;
816 Pass.ORE->emit([&]() {
818 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
819 << "Minimal Initiation Interval too large: "
820 << ore::NV("MII", (int)MII) << " > "
821 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
822 << "Refer to -pipeliner-max-mii.";
823 });
824 return;
825 }
826
827 computeNodeFunctions(NodeSets);
828
829 registerPressureFilter(NodeSets);
830
831 colocateNodeSets(NodeSets);
832
833 checkNodeSets(NodeSets);
834
835 LLVM_DEBUG({
836 for (auto &I : NodeSets) {
837 dbgs() << " Rec NodeSet ";
838 I.dump();
839 }
840 });
841
842 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
843
844 groupRemainingNodes(NodeSets);
845
846 removeDuplicateNodes(NodeSets);
847
848 LLVM_DEBUG({
849 for (auto &I : NodeSets) {
850 dbgs() << " NodeSet ";
851 I.dump();
852 }
853 });
854
855 computeNodeOrder(NodeSets);
856
857 // check for node order issues
858 checkValidNodeOrder(Circuits);
859
860 SMSchedule Schedule(Pass.MF, this);
861 Scheduled = schedulePipeline(Schedule);
862
863 if (!Scheduled){
864 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
865 NumFailNoSchedule++;
866 Pass.ORE->emit([&]() {
868 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
869 << "Unable to find schedule";
870 });
871 return;
872 }
873
874 unsigned numStages = Schedule.getMaxStageCount();
875 // No need to generate pipeline if there are no overlapped iterations.
876 if (numStages == 0) {
877 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
878 NumFailZeroStage++;
879 Pass.ORE->emit([&]() {
881 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
882 << "No need to pipeline - no overlapped iterations in schedule.";
883 });
884 return;
885 }
886 // Check that the maximum stage count is less than user-defined limit.
887 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
888 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
889 << " : too many stages, abort\n");
890 NumFailLargeMaxStage++;
891 Pass.ORE->emit([&]() {
893 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
894 << "Too many stages in schedule: "
895 << ore::NV("numStages", (int)numStages) << " > "
896 << ore::NV("SwpMaxStages", SwpMaxStages)
897 << ". Refer to -pipeliner-max-stages.";
898 });
899 return;
900 }
901
902 Pass.ORE->emit([&]() {
903 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
904 Loop.getHeader())
905 << "Pipelined succesfully!";
906 });
907
908 // Generate the schedule as a ModuloSchedule.
909 DenseMap<MachineInstr *, int> Cycles, Stages;
910 std::vector<MachineInstr *> OrderedInsts;
911 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
912 ++Cycle) {
913 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
914 OrderedInsts.push_back(SU->getInstr());
915 Cycles[SU->getInstr()] = Cycle;
916 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
917 }
918 }
920 for (auto &KV : NewMIs) {
921 Cycles[KV.first] = Cycles[KV.second];
922 Stages[KV.first] = Stages[KV.second];
923 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
924 }
925
926 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
927 std::move(Stages));
929 assert(NewInstrChanges.empty() &&
930 "Cannot serialize a schedule with InstrChanges!");
932 MSTI.annotate();
933 return;
934 }
935 // The experimental code generator can't work if there are InstChanges.
936 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
937 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
938 MSE.expand();
939 } else if (MVECodeGen && NewInstrChanges.empty() &&
940 LoopPipelinerInfo->isMVEExpanderSupported() &&
942 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
943 MSE.expand();
944 } else {
945 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
946 MSE.expand();
947 MSE.cleanup();
948 }
949 ++NumPipelined;
950}
951
952/// Clean up after the software pipeliner runs.
// NOTE(review): the function header (doxygen line 953, presumably
// `void SwingSchedulerDAG::finishBlock() {`) and the superclass call on line
// 959 are missing from this listing — confirm against the full source.
 954 for (auto &KV : NewMIs)
// Free every instruction stashed in the NewMIs map, then drop the mapping.
 955 MF.deleteMachineInstr(KV.second);
 956 NewMIs.clear();
 957
 958 // Call the superclass.
 960}
961
962/// Return the register values for the operands of a Phi instruction.
963/// This function assumes the instruction is a Phi.
// \p InitVal receives the incoming value from outside the loop and
// \p LoopVal the loop-carried (back-edge) value; both are reset first.
// NOTE(review): the first signature line (doxygen 964, carrying the function
// name and the Phi/Loop parameters) is missing from this listing.
 965 Register &InitVal, Register &LoopVal) {
 966 assert(Phi.isPHI() && "Expecting a Phi.");
 967
 968 InitVal = Register();
 969 LoopVal = Register();
// PHI operands are (Reg, MBB) pairs starting at index 1; classify each
// incoming value by whether its block is the loop block.
 970 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
 971 if (Phi.getOperand(i + 1).getMBB() != Loop)
 972 InitVal = Phi.getOperand(i).getReg();
 973 else
 974 LoopVal = Phi.getOperand(i).getReg();
 975
 976 assert(InitVal && LoopVal && "Unexpected Phi structure.");
 977}
978
979/// Return the Phi register value that comes from the loop block, or an
/// invalid Register() if \p LoopBB contributes no incoming value.
// NOTE(review): the first signature line (doxygen 980, with the function name
// and the Phi parameter) is missing from this listing.
 981 const MachineBasicBlock *LoopBB) {
 982 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
 983 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
 984 return Phi.getOperand(i).getReg();
 985 return Register();
 986}
987
988/// Return true if SUb can be reached from SUa following the chain edges.
// Iterative DFS over order (SDep::Order) successors only.
// NOTE(review): the declarations of Visited and Worklist (doxygen lines
// 990-991) are missing from this listing — confirm against the full source.
989static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
 992 Worklist.push_back(SUa);
 993 while (!Worklist.empty()) {
 994 const SUnit *SU = Worklist.pop_back_val();
 995 for (const auto &SI : SU->Succs) {
 996 SUnit *SuccSU = SI.getSUnit();
// Only chain (order) edges count; data/anti edges are ignored here.
 997 if (SI.getKind() == SDep::Order) {
 998 if (Visited.count(SuccSU))
 999 continue;
 1000 if (SuccSU == SUb)
 1001 return true;
 1002 Worklist.push_back(SuccSU);
 1003 Visited.insert(SuccSU);
 1004 }
 1005 }
 1006 }
 1007 return false;
 1008}
1009
1011 if (!getUnderlyingObjects())
1012 return;
1013 for (const Value *Obj : UnderlyingObjs)
1014 if (!isIdentifiedObject(Obj)) {
1015 IsAllIdentified = false;
1016 break;
1017 }
1018}
1019
// Return true when this access and \p Other provably touch different memory:
// both must have fully identified underlying objects and share none of them.
// NOTE(review): the function header (doxygen line 1020, presumably
// `bool SUnitWithMemInfo::isTriviallyDisjoint(`) is missing from this
// listing.
 1021 const SUnitWithMemInfo &Other) const {
 1022 // If all underlying objects are identified objects and there is no overlap
 1023 // between them, then these two instructions are disjoint.
 1024 if (!IsAllIdentified || !Other.IsAllIdentified)
 1025 return false;
// Distinct identified objects cannot overlap, so any shared object is the
// only way an overlap can occur.
 1026 for (const Value *Obj : UnderlyingObjs)
 1027 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
 1028 return false;
 1029 return true;
 1030}
1031
1032/// Collect the underlying objects for the memory references of an instruction.
1033/// This function calls the code in ValueTracking, but first checks that the
1034/// instruction has a memory operand.
1035/// Returns false if we cannot find the underlying objects.
// On success also caches the operand's value, offset, and AA metadata.
1036bool SUnitWithMemInfo::getUnderlyingObjects() {
 1037 const MachineInstr *MI = SU->getInstr();
// Only instructions with exactly one memory operand are analyzable here.
 1038 if (!MI->hasOneMemOperand())
 1039 return false;
 1040 MachineMemOperand *MM = *MI->memoperands_begin();
 1041 if (!MM->getValue())
 1042 return false;
 1043 MemOpValue = MM->getValue();
 1044 MemOpOffset = MM->getOffset();
// NOTE(review): the call populating UnderlyingObjs (doxygen line 1045,
// presumably `llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);`) is
// missing from this listing — confirm against the full source.
 1046
 1047 // TODO: A no alias scope may be valid only in a single iteration. In this
 1048 // case we need to peel off it like LoopAccessAnalysis does.
 1049 AATags = MM->getAAInfo();
 1050 return true;
 1051}
1052
1053/// Returns true if there is a loop-carried order dependency from \p Src to \p
1054/// Dst.
// Cheap structural checks run first; alias analysis is consulted last.
// NOTE(review): the TII and TRI parameters are unused in the visible body.
1055static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
1056 const SUnitWithMemInfo &Dst,
1057 BatchAAResults &BAA,
1058 const TargetInstrInfo *TII,
1059 const TargetRegisterInfo *TRI,
1060 const SwingSchedulerDAG *SSD) {
// Provably distinct identified objects: no dependence.
 1061 if (Src.isTriviallyDisjoint(Dst))
 1062 return false;
// An intra-iteration chain from Src to Dst already orders them.
 1063 if (isSuccOrder(Src.SU, Dst.SU))
 1064 return false;
 1065
 1066 MachineInstr &SrcMI = *Src.SU->getInstr();
 1067 MachineInstr &DstMI = *Dst.SU->getInstr();
 1068
// Target/DAG analysis may prove the accesses cannot overlap across
// iterations.
 1069 if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
 1070 return false;
 1071
 1072 // More expensive checks follow. If either access's memory value is
 1073 // unknown, conservatively assume a loop-carried dependence. Likewise if
 1074 // both reference the same value and Src's offset does not exceed Dst's.
 1075 if (Src.isUnknown() || Dst.isUnknown())
 1076 return true;
 1077 if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
 1078 return true;
 1079
// Whole-object (before-or-after) alias query on the memory values.
 1080 if (BAA.isNoAlias(
 1081 MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
 1082 MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
 1083 return false;
 1084
 1085 // AliasAnalysis sometimes gives up on following the underlying
 1086 // object. In such a case, separate checks for underlying objects may
 1087 // prove that there are no aliases between two accesses.
 1088 for (const Value *SrcObj : Src.UnderlyingObjs)
 1089 for (const Value *DstObj : Dst.UnderlyingObjs)
 1090 if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
 1091 MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
 1092 return true;
 1093
 1094 return false;
 1095}
1096
1097void LoopCarriedOrderDepsTracker::NoBarrierInstsChunk::append(SUnit *SU) {
1098 const MachineInstr *MI = SU->getInstr();
1099 if (MI->mayStore())
1100 Stores.emplace_back(SU);
1101 else if (MI->mayLoad())
1102 Loads.emplace_back(SU);
1103 else if (MI->mayRaiseFPException())
1104 FPExceptions.emplace_back(SU);
1105 else
1106 llvm_unreachable("Unexpected instruction type.");
1107}
1108
1110 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1111 const TargetRegisterInfo *TRI)
1112 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1113 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1114
1116 // Traverse all instructions and extract only what we are targetting.
1117 for (auto &SU : SUnits) {
1118 auto Tagged = getInstrTag(&SU);
1119
1120 // This instruction has no loop-carried order-dependencies.
1121 if (!Tagged)
1122 continue;
1123 TaggedSUnits.emplace_back(&SU, *Tagged);
1124 }
1125
1126 computeDependenciesAux();
1127}
1128
1129std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1130LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1131 MachineInstr *MI = SU->getInstr();
1132 if (TII->isGlobalMemoryObject(MI))
1133 return InstrTag::Barrier;
1134
1135 if (MI->mayStore() ||
1136 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1137 return InstrTag::LoadOrStore;
1138
1139 if (MI->mayRaiseFPException())
1140 return InstrTag::FPExceptions;
1141
1142 return std::nullopt;
1143}
1144
1145void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1146 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst) {
1147 // Avoid self-dependencies.
1148 if (Src.SU == Dst.SU)
1149 return;
1150
1151 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG))
1152 setLoopCarriedDep(Src.SU, Dst.SU);
1153}
1154
1155void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1156 const NoBarrierInstsChunk &From, const NoBarrierInstsChunk &To) {
1157 // Add load-to-store dependencies (WAR).
1158 for (const SUnitWithMemInfo &Src : From.Loads)
1159 for (const SUnitWithMemInfo &Dst : To.Stores)
1160 addDependenciesBetweenSUs(Src, Dst);
1161
1162 // Add store-to-load dependencies (RAW).
1163 for (const SUnitWithMemInfo &Src : From.Stores)
1164 for (const SUnitWithMemInfo &Dst : To.Loads)
1165 addDependenciesBetweenSUs(Src, Dst);
1166
1167 // Add store-to-store dependencies (WAW).
1168 for (const SUnitWithMemInfo &Src : From.Stores)
1169 for (const SUnitWithMemInfo &Dst : To.Stores)
1170 addDependenciesBetweenSUs(Src, Dst);
1171}
1172
1173void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
1175 SUnit *FirstBarrier = nullptr;
1176 SUnit *LastBarrier = nullptr;
1177 for (const auto &TSU : TaggedSUnits) {
1178 InstrTag Tag = TSU.getTag();
1179 SUnit *SU = TSU.getPointer();
1180 switch (Tag) {
1181 case InstrTag::Barrier:
1182 if (!FirstBarrier)
1183 FirstBarrier = SU;
1184 LastBarrier = SU;
1185 Chunks.emplace_back();
1186 break;
1187 case InstrTag::LoadOrStore:
1188 case InstrTag::FPExceptions:
1189 Chunks.back().append(SU);
1190 break;
1191 }
1192 }
1193
1194 // Add dependencies between memory operations. If there are one or more
1195 // barrier events between two memory instructions, we don't add a
1196 // loop-carried dependence for them.
1197 for (const NoBarrierInstsChunk &Chunk : Chunks)
1198 addLoopCarriedDepenenciesForChunks(Chunk, Chunk);
1199
1200 // There is no barrier instruction between load/store/fp-exception
1201 // instructions in the same chunk. If there are one or more barrier
1202 // instructions, the instructions sequence is as follows:
1203 //
1204 // Loads/Stores/FPExceptions (Chunks.front())
1205 // Barrier (FirstBarrier)
1206 // Loads/Stores/FPExceptions
1207 // Barrier
1208 // ...
1209 // Loads/Stores/FPExceptions
1210 // Barrier (LastBarrier)
1211 // Loads/Stores/FPExceptions (Chunks.back())
1212 //
1213 // Since loads/stores/fp-exceptions must not be reordered across barrier
1214 // instructions, and the order of barrier instructions must be preserved, add
1215 // the following loop-carried dependences:
1216 //
1217 // Loads/Stores/FPExceptions (Chunks.front()) <-----+
1218 // +--> Barrier (FirstBarrier) <----------------------+ |
1219 // | Loads/Stores/FPExceptions | |
1220 // | Barrier | |
1221 // | ... | |
1222 // | Loads/Stores/FPExceptions | |
1223 // | Barrier (LastBarrier) ------------------------+--+
1224 // +--- Loads/Stores/FPExceptions (Chunks.back())
1225 //
1226 if (FirstBarrier) {
1227 assert(LastBarrier && "Both barriers should be set.");
1228
1229 // LastBarrier -> Loads/Stores/FPExceptions in Chunks.front()
1230 for (const SUnitWithMemInfo &Dst : Chunks.front().Loads)
1231 setLoopCarriedDep(LastBarrier, Dst.SU);
1232 for (const SUnitWithMemInfo &Dst : Chunks.front().Stores)
1233 setLoopCarriedDep(LastBarrier, Dst.SU);
1234 for (const SUnitWithMemInfo &Dst : Chunks.front().FPExceptions)
1235 setLoopCarriedDep(LastBarrier, Dst.SU);
1236
1237 // Loads/Stores/FPExceptions in Chunks.back() -> FirstBarrier
1238 for (const SUnitWithMemInfo &Src : Chunks.back().Loads)
1239 setLoopCarriedDep(Src.SU, FirstBarrier);
1240 for (const SUnitWithMemInfo &Src : Chunks.back().Stores)
1241 setLoopCarriedDep(Src.SU, FirstBarrier);
1242 for (const SUnitWithMemInfo &Src : Chunks.back().FPExceptions)
1243 setLoopCarriedDep(Src.SU, FirstBarrier);
1244
1245 // LastBarrier -> FirstBarrier (if they are different)
1246 if (FirstBarrier != LastBarrier)
1247 setLoopCarriedDep(LastBarrier, FirstBarrier);
1248 }
1249}
1250
1251/// Add a chain edge between a load and store if the store can be an
1252/// alias of the load on a subsequent iteration, i.e., a loop carried
1253/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1254/// but that code doesn't create loop carried dependences.
1255/// TODO: Also compute output-dependencies.
1256LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1257 LoopCarriedEdges LCE;
1258
1259 // Add loop-carried order-dependencies
1260 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1261 LCODTracker.computeDependencies();
1262 for (unsigned I = 0; I != SUnits.size(); I++)
1263 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1264 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1265
1266 LCE.modifySUnits(SUnits, TII);
1267 return LCE;
1268}
1269
1270/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1271/// processes dependences for PHIs. This function adds true dependences
1272/// from a PHI to a use, and a loop carried dependence from the use to the
1273/// PHI. The loop carried dependence is represented as an anti dependence
1274/// edge. This function also removes chain dependences between unrelated
1275/// PHIs.
1276void SwingSchedulerDAG::updatePhiDependences() {
1277 SmallVector<SDep, 4> RemoveDeps;
1278 const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1279
1280 // Iterate over each DAG node.
1281 for (SUnit &I : SUnits) {
1282 RemoveDeps.clear();
1283 // Set to true if the instruction has an operand defined by a Phi.
1284 Register HasPhiUse;
1285 Register HasPhiDef;
1286 MachineInstr *MI = I.getInstr();
1287 // Iterate over each operand, and we process the definitions.
1288 for (const MachineOperand &MO : MI->operands()) {
1289 if (!MO.isReg())
1290 continue;
1291 Register Reg = MO.getReg();
1292 if (MO.isDef()) {
1293 // If the register is used by a Phi, then create an anti dependence.
1295 UI = MRI.use_instr_begin(Reg),
1296 UE = MRI.use_instr_end();
1297 UI != UE; ++UI) {
1298 MachineInstr *UseMI = &*UI;
1299 SUnit *SU = getSUnit(UseMI);
1300 if (SU != nullptr && UseMI->isPHI()) {
1301 if (!MI->isPHI()) {
1302 SDep Dep(SU, SDep::Anti, Reg);
1303 Dep.setLatency(1);
1304 I.addPred(Dep);
1305 } else {
1306 HasPhiDef = Reg;
1307 // Add a chain edge to a dependent Phi that isn't an existing
1308 // predecessor.
1309
1310 // %3:intregs = PHI %21:intregs, %bb.6, %7:intregs, %bb.1 - SU0
1311 // %7:intregs = PHI %21:intregs, %bb.6, %13:intregs, %bb.1 - SU1
1312 // %27:intregs = A2_zxtb %3:intregs - SU2
1313 // %13:intregs = C2_muxri %45:predregs, 0, %46:intreg
1314 // If we have dependent phis, SU0 should be the successor of SU1
1315 // not the other way around. (it used to be SU1 is the successor
1316 // of SU0). In some cases, SU0 is scheduled earlier than SU1
1317 // resulting in bad IR as we do not have a value that can be used
1318 // by SU2.
1319
1320 if (SU->NodeNum < I.NodeNum && !SU->isPred(&I))
1321 SU->addPred(SDep(&I, SDep::Barrier));
1322 }
1323 }
1324 }
1325 } else if (MO.isUse()) {
1326 // If the register is defined by a Phi, then create a true dependence.
1327 MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1328 if (DefMI == nullptr)
1329 continue;
1330 SUnit *SU = getSUnit(DefMI);
1331 if (SU != nullptr && DefMI->isPHI()) {
1332 if (!MI->isPHI()) {
1333 SDep Dep(SU, SDep::Data, Reg);
1334 Dep.setLatency(0);
1335 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
1336 &SchedModel);
1337 I.addPred(Dep);
1338 } else {
1339 HasPhiUse = Reg;
1340 // Add a chain edge to a dependent Phi that isn't an existing
1341 // predecessor.
1342 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1343 I.addPred(SDep(SU, SDep::Barrier));
1344 }
1345 }
1346 }
1347 }
1348 // Remove order dependences from an unrelated Phi.
1349 if (!SwpPruneDeps)
1350 continue;
1351 for (auto &PI : I.Preds) {
1352 MachineInstr *PMI = PI.getSUnit()->getInstr();
1353 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1354 if (I.getInstr()->isPHI()) {
1355 if (PMI->getOperand(0).getReg() == HasPhiUse)
1356 continue;
1357 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1358 continue;
1359 }
1360 RemoveDeps.push_back(PI);
1361 }
1362 }
1363 for (const SDep &D : RemoveDeps)
1364 I.removePred(D);
1365 }
1366}
1367
1368/// Iterate over each DAG node and see if we can change any dependences
1369/// in order to reduce the recurrence MII.
1370void SwingSchedulerDAG::changeDependences() {
1371 // See if an instruction can use a value from the previous iteration.
1372 // If so, we update the base and offset of the instruction and change
1373 // the dependences.
1374 for (SUnit &I : SUnits) {
1375 unsigned BasePos = 0, OffsetPos = 0;
1376 Register NewBase;
1377 int64_t NewOffset = 0;
1378 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1379 NewOffset))
1380 continue;
1381
1382 // Get the MI and SUnit for the instruction that defines the original base.
1383 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1384 MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1385 if (!DefMI)
1386 continue;
1387 SUnit *DefSU = getSUnit(DefMI);
1388 if (!DefSU)
1389 continue;
1390 // Get the MI and SUnit for the instruction that defins the new base.
1391 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1392 if (!LastMI)
1393 continue;
1394 SUnit *LastSU = getSUnit(LastMI);
1395 if (!LastSU)
1396 continue;
1397
1398 if (Topo.IsReachable(&I, LastSU))
1399 continue;
1400
1401 // Remove the dependence. The value now depends on a prior iteration.
1403 for (const SDep &P : I.Preds)
1404 if (P.getSUnit() == DefSU)
1405 Deps.push_back(P);
1406 for (const SDep &D : Deps) {
1407 Topo.RemovePred(&I, D.getSUnit());
1408 I.removePred(D);
1409 }
1410 // Remove the chain dependence between the instructions.
1411 Deps.clear();
1412 for (auto &P : LastSU->Preds)
1413 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1414 Deps.push_back(P);
1415 for (const SDep &D : Deps) {
1416 Topo.RemovePred(LastSU, D.getSUnit());
1417 LastSU->removePred(D);
1418 }
1419
1420 // Add a dependence between the new instruction and the instruction
1421 // that defines the new base.
1422 SDep Dep(&I, SDep::Anti, NewBase);
1423 Topo.AddPred(LastSU, &I);
1424 LastSU->addPred(Dep);
1425
1426 // Remember the base and offset information so that we can update the
1427 // instruction during code generation.
1428 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1429 }
1430}
1431
1432/// Create an instruction stream that represents a single iteration and stage of
1433/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1434/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1435/// function is an approximation of SMSchedule::finalizeSchedule with all
1436/// non-const operations removed.
1438 SMSchedule &Schedule,
1439 std::vector<MachineInstr *> &OrderedInsts,
1442
1443 // Move all instructions to the first stage from the later stages.
1444 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1445 ++Cycle) {
1446 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1447 Stage <= LastStage; ++Stage) {
1448 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1449 Cycle + Stage * Schedule.getInitiationInterval()))) {
1450 Instrs[Cycle].push_front(SU);
1451 }
1452 }
1453 }
1454
1455 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1456 ++Cycle) {
1457 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1458 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1459 for (SUnit *SU : CycleInstrs) {
1460 MachineInstr *MI = SU->getInstr();
1461 OrderedInsts.push_back(MI);
1462 Stages[MI] = Schedule.stageScheduled(SU);
1463 }
1464 }
1465}
1466
1467namespace {
1468
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // Histogram: how many instructions require each single-choice resource.
  // Filled by calcCriticalResources and used as the tie breaker in
  // operator().
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  // \p F receives the functional-unit mask (itinerary path) or the processor
  // resource index (sched-model path) of the most constrained stage; it is
  // left untouched when no stage/resource entry applies.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        // Skip entries that don't actually consume the resource.
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only single-alternative stages identify a critical resource.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1572
1573/// Calculate the maximum register pressure of the scheduled instructions stream
1574class HighRegisterPressureDetector {
1575 MachineBasicBlock *OrigMBB;
1576 const MachineRegisterInfo &MRI;
1577 const TargetRegisterInfo *TRI;
1578
1579 const unsigned PSetNum;
1580
1581 // Indexed by PSet ID
1582 // InitSetPressure takes into account the register pressure of live-in
1583 // registers. It's not depend on how the loop is scheduled, so it's enough to
1584 // calculate them once at the beginning.
1585 std::vector<unsigned> InitSetPressure;
1586
1587 // Indexed by PSet ID
1588 // Upper limit for each register pressure set
1589 std::vector<unsigned> PressureSetLimit;
1590
1591 DenseMap<MachineInstr *, RegisterOperands> ROMap;
1592
1593 using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1594
1595public:
1596 using OrderedInstsTy = std::vector<MachineInstr *>;
1597 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1598
1599private:
1600 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1601 if (Pressures.size() == 0) {
1602 dbgs() << "[]";
1603 } else {
1604 char Prefix = '[';
1605 for (unsigned P : Pressures) {
1606 dbgs() << Prefix << P;
1607 Prefix = ' ';
1608 }
1609 dbgs() << ']';
1610 }
1611 }
1612
1613 void dumpPSet(Register Reg) const {
1614 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1615 // FIXME: The static_cast is a bug compensating bugs in the callers.
1616 VirtRegOrUnit VRegOrUnit =
1617 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1618 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1619 for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
1620 ++PSetIter) {
1621 dbgs() << *PSetIter << ' ';
1622 }
1623 dbgs() << '\n';
1624 }
1625
1626 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1627 Register Reg) const {
1628 // FIXME: The static_cast is a bug compensating bugs in the callers.
1629 VirtRegOrUnit VRegOrUnit =
1630 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1631 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1632 auto PSetIter = MRI.getPressureSets(VRegOrUnit);
1633 unsigned Weight = PSetIter.getWeight();
1634 for (; PSetIter.isValid(); ++PSetIter)
1635 Pressure[*PSetIter] += Weight;
1636 }
1637
1638 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1639 Register Reg) const {
1640 auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
1641 unsigned Weight = PSetIter.getWeight();
1642 for (; PSetIter.isValid(); ++PSetIter) {
1643 auto &P = Pressure[*PSetIter];
1644 assert(P >= Weight &&
1645 "register pressure must be greater than or equal weight");
1646 P -= Weight;
1647 }
1648 }
1649
1650 // Return true if Reg is reserved one, for example, stack pointer
1651 bool isReservedRegister(Register Reg) const {
1652 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1653 }
1654
1655 bool isDefinedInThisLoop(Register Reg) const {
1656 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1657 }
1658
1659 // Search for live-in variables. They are factored into the register pressure
1660 // from the begining. Live-in variables used by every iteration should be
1661 // considered as alive throughout the loop. For example, the variable `c` in
1662 // following code. \code
1663 // int c = ...;
1664 // for (int i = 0; i < n; i++)
1665 // a[i] += b[i] + c;
1666 // \endcode
1667 void computeLiveIn() {
1668 DenseSet<Register> Used;
1669 for (auto &MI : *OrigMBB) {
1670 if (MI.isDebugInstr())
1671 continue;
1672 for (auto &Use : ROMap[&MI].Uses) {
1673 // FIXME: The static_cast is a bug.
1674 Register Reg =
1675 Use.VRegOrUnit.isVirtualReg()
1676 ? Use.VRegOrUnit.asVirtualReg()
1677 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1678 // Ignore the variable that appears only on one side of phi instruction
1679 // because it's used only at the first iteration.
1680 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1681 continue;
1682 if (isReservedRegister(Reg))
1683 continue;
1684 if (isDefinedInThisLoop(Reg))
1685 continue;
1686 Used.insert(Reg);
1687 }
1688 }
1689
1690 for (auto LiveIn : Used)
1691 increaseRegisterPressure(InitSetPressure, LiveIn);
1692 }
1693
1694 // Calculate the upper limit of each pressure set
1695 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1696 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1697 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1698 }
1699
1700 // There are two patterns of last-use.
1701 // - by an instruction of the current iteration
1702 // - by a phi instruction of the next iteration (loop carried value)
1703 //
1704 // Furthermore, following two groups of instructions are executed
1705 // simultaneously
1706 // - next iteration's phi instructions in i-th stage
1707 // - current iteration's instructions in i+1-th stage
1708 //
1709 // This function calculates the last-use of each register while taking into
1710 // account the above two patterns.
1711 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1712 Instr2StageTy &Stages) const {
1713 // We treat virtual registers that are defined and used in this loop.
1714 // Following virtual register will be ignored
1715 // - live-in one
1716 // - defined but not used in the loop (potentially live-out)
1717 DenseSet<Register> TargetRegs;
1718 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1719 if (isDefinedInThisLoop(Reg))
1720 TargetRegs.insert(Reg);
1721 };
1722 for (MachineInstr *MI : OrderedInsts) {
1723 if (MI->isPHI()) {
1724 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1725 UpdateTargetRegs(Reg);
1726 } else {
1727 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1728 // FIXME: The static_cast is a bug.
1729 Register Reg = Use.VRegOrUnit.isVirtualReg()
1730 ? Use.VRegOrUnit.asVirtualReg()
1731 : Register(static_cast<unsigned>(
1732 Use.VRegOrUnit.asMCRegUnit()));
1733 UpdateTargetRegs(Reg);
1734 }
1735 }
1736 }
1737
1738 const auto InstrScore = [&Stages](MachineInstr *MI) {
1739 return Stages[MI] + MI->isPHI();
1740 };
1741
1742 DenseMap<Register, MachineInstr *> LastUseMI;
1743 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1744 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1745 // FIXME: The static_cast is a bug.
1746 Register Reg =
1747 Use.VRegOrUnit.isVirtualReg()
1748 ? Use.VRegOrUnit.asVirtualReg()
1749 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1750 if (!TargetRegs.contains(Reg))
1751 continue;
1752 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1753 if (!Inserted) {
1754 MachineInstr *Orig = Ite->second;
1755 MachineInstr *New = MI;
1756 if (InstrScore(Orig) < InstrScore(New))
1757 Ite->second = New;
1758 }
1759 }
1760 }
1761
1762 Instr2LastUsesTy LastUses;
1763 for (auto [Reg, MI] : LastUseMI)
1764 LastUses[MI].insert(Reg);
1765 return LastUses;
1766 }
1767
1768 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1769 // iterations and check the register pressure at the point where all stages
1770 // overlapping.
1771 //
1772 // An example of unrolled loop where #Stage is 4..
1773 // Iter i+0 i+1 i+2 i+3
1774 // ------------------------
1775 // Stage 0
1776 // Stage 1 0
1777 // Stage 2 1 0
1778 // Stage 3 2 1 0 <- All stages overlap
1779 //
1780 std::vector<unsigned>
1781 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1782 Instr2StageTy &Stages,
1783 const unsigned StageCount) const {
1784 using RegSetTy = SmallDenseSet<Register, 16>;
1785
1786 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1787 // manage the liveness of the registers independently by iterations.
1788 SmallVector<RegSetTy> LiveRegSets(StageCount);
1789
1790 auto CurSetPressure = InitSetPressure;
1791 auto MaxSetPressure = InitSetPressure;
1792 auto LastUses = computeLastUses(OrderedInsts, Stages);
1793
1794 LLVM_DEBUG({
1795 dbgs() << "Ordered instructions:\n";
1796 for (MachineInstr *MI : OrderedInsts) {
1797 dbgs() << "Stage " << Stages[MI] << ": ";
1798 MI->dump();
1799 }
1800 });
1801
1802 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1803 VirtRegOrUnit VRegOrUnit) {
1804 // FIXME: The static_cast is a bug.
1805 Register Reg =
1806 VRegOrUnit.isVirtualReg()
1807 ? VRegOrUnit.asVirtualReg()
1808 : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
1809 if (!Reg.isValid() || isReservedRegister(Reg))
1810 return;
1811
1812 bool Inserted = RegSet.insert(Reg).second;
1813 if (!Inserted)
1814 return;
1815
1816 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1817 increaseRegisterPressure(CurSetPressure, Reg);
1818 LLVM_DEBUG(dumpPSet(Reg));
1819 };
1820
1821 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1822 Register Reg) {
1823 if (!Reg.isValid() || isReservedRegister(Reg))
1824 return;
1825
1826 // live-in register
1827 if (!RegSet.contains(Reg))
1828 return;
1829
1830 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1831 RegSet.erase(Reg);
1832 decreaseRegisterPressure(CurSetPressure, Reg);
1833 LLVM_DEBUG(dumpPSet(Reg));
1834 };
1835
1836 for (unsigned I = 0; I < StageCount; I++) {
1837 for (MachineInstr *MI : OrderedInsts) {
1838 const auto Stage = Stages[MI];
1839 if (I < Stage)
1840 continue;
1841
1842 const unsigned Iter = I - Stage;
1843
1844 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1845 InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
1846
1847 for (auto LastUse : LastUses[MI]) {
1848 if (MI->isPHI()) {
1849 if (Iter != 0)
1850 EraseReg(LiveRegSets[Iter - 1], LastUse);
1851 } else {
1852 EraseReg(LiveRegSets[Iter], LastUse);
1853 }
1854 }
1855
1856 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1857 MaxSetPressure[PSet] =
1858 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1859
1860 LLVM_DEBUG({
1861 dbgs() << "CurSetPressure=";
1862 dumpRegisterPressures(CurSetPressure);
1863 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1864 MI->dump();
1865 });
1866 }
1867 }
1868
1869 return MaxSetPressure;
1870 }
1871
1872public:
1873 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1874 const MachineFunction &MF)
1875 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1876 TRI(MF.getSubtarget().getRegisterInfo()),
1877 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1878 PressureSetLimit(PSetNum, 0) {}
1879
1880 // Used to calculate register pressure, which is independent of loop
1881 // scheduling.
1882 void init(const RegisterClassInfo &RCI) {
1883 for (MachineInstr &MI : *OrigMBB) {
1884 if (MI.isDebugInstr())
1885 continue;
1886 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1887 }
1888
1889 computeLiveIn();
1890 computePressureSetLimit(RCI);
1891 }
1892
1893 // Calculate the maximum register pressures of the loop and check if they
1894 // exceed the limit
1895 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1896 const unsigned MaxStage) const {
1898 "the percentage of the margin must be between 0 to 100");
1899
1900 OrderedInstsTy OrderedInsts;
1901 Instr2StageTy Stages;
1902 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1903 const auto MaxSetPressure =
1904 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1905
1906 LLVM_DEBUG({
1907 dbgs() << "Dump MaxSetPressure:\n";
1908 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1909 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1910 }
1911 dbgs() << '\n';
1912 });
1913
1914 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1915 unsigned Limit = PressureSetLimit[PSet];
1916 unsigned Margin = Limit * RegPressureMargin / 100;
1917 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1918 << " Margin=" << Margin << "\n");
1919 if (Limit < MaxSetPressure[PSet] + Margin) {
1920 LLVM_DEBUG(
1921 dbgs()
1922 << "Rejected the schedule because of too high register pressure\n");
1923 return true;
1924 }
1925 }
1926 return false;
1927 }
1928};
1929
1930} // end anonymous namespace
1931
1932/// Calculate the resource constrained minimum initiation interval for the
1933/// specified loop. We use the DFA to model the resources needed for
1934/// each instruction, and we ignore dependences. A different DFA is created
1935/// for each cycle that is required. When adding a new instruction, we attempt
1936/// to add it to each existing DFA, until a legal space is found. If the
1937/// instruction cannot be reserved in an existing DFA, we create a new one.
1938unsigned SwingSchedulerDAG::calculateResMII() {
1939 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1940 ResourceManager RM(&MF.getSubtarget(), this);
1941 return RM.calculateResMII();
1942}
1943
1944/// Calculate the recurrence-constrainted minimum initiation interval.
1945/// Iterate over each circuit. Compute the delay(c) and distance(c)
1946/// for each circuit. The II needs to satisfy the inequality
1947/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1948/// II that satisfies the inequality, and the RecMII is the maximum
1949/// of those values.
1950unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1951 unsigned RecMII = 0;
1952
1953 for (NodeSet &Nodes : NodeSets) {
1954 if (Nodes.empty())
1955 continue;
1956
1957 unsigned Delay = Nodes.getLatency();
1958 unsigned Distance = 1;
1959
1960 // ii = ceil(delay / distance)
1961 unsigned CurMII = (Delay + Distance - 1) / Distance;
1962 Nodes.setRecMII(CurMII);
1963 if (CurMII > RecMII)
1964 RecMII = CurMII;
1965 }
1966
1967 return RecMII;
1968}
1969
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDDG *DDG) {
  BitVector Added(SUnits.size());
  // Maps the tail node of an output-dependence chain to its head, so that a
  // single back-edge per chain can be added at the end.
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        // If i is itself the tail of a recorded chain, extend the chain:
        // remap its head onto the new tail N.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }

    // Also add any extra out edges to the adjacency matrix.
    for (const SUnit *Dst : DDG->getExtraOutEdges(&SUnits[i])) {
      int N = Dst->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
  }

  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
2028
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. This is the core recursion of Johnson's circuit
/// enumeration: V is the current node, S is the start node of this search,
/// and HasBackedge records whether the current path already traversed an
/// edge that goes backwards in the topological order.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Cap the search once too many paths have been explored.
    if (NumPaths > MaxPaths)
      break;
    // Nodes numbered below the start node belong to earlier searches.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node. Record it only if the path
      // did not already contain a backedge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V: remember that V must be unblocked whenever one
    // of its (still blocked) successors is unblocked.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
2070
2071/// Unblock a node in the circuit finding algorithm.
2072void SwingSchedulerDAG::Circuits::unblock(int U) {
2073 Blocked.reset(U);
2074 SmallPtrSet<SUnit *, 4> &BU = B[U];
2075 while (!BU.empty()) {
2076 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2077 assert(SI != BU.end() && "Invalid B set.");
2078 SUnit *W = *SI;
2079 BU.erase(W);
2080 if (Blocked.test(W->NodeNum))
2081 unblock(W->NodeNum);
2082 }
2083}
2084
2085/// Identify all the elementary circuits in the dependence graph using
2086/// Johnson's circuit algorithm.
2087void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2088 Circuits Cir(SUnits, Topo);
2089 // Create the adjacency structure.
2090 Cir.createAdjacencyStructure(&*DDG);
2091 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2092 Cir.reset();
2093 Cir.circuit(I, I, NodeSets, this);
2094 }
2095}
2096
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
// additional copies that are needed across iterations. An artificial dependence
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPhi

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), implies, the COPY/REG_SEQUENCE is scheduled
// late to avoid additional copies across iterations. The possible scheduling
// order would be
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.

void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        // Chained PHIs/REG_SEQUENCEs are followed transitively; their uses
        // are collected on a later pass of this growing worklist.
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        // Reachability check on the topological order prevents creating a
        // cycle via the new Artificial edge.
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
2176
/// Compute several functions need to order the nodes for scheduling.
///  ASAP - Earliest time to schedule a node.
///  ALAP - Latest time to schedule a node.
///  MOV - Mobility function, difference between ALAP and ASAP.
///  D - Depth of each node.
///  H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *Pred = IE.getSrc();
      if (IE.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
      if (IE.ignoreDependence(true))
        continue;
      // A loop-carried edge (distance > 0) contributes its latency minus one
      // initiation interval per iteration of distance.
      asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
                                  IE.getDistance() * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV. ALAP is seeded with the
  // largest ASAP seen and is tightened while walking the topological order
  // in reverse.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *Succ = OE.getDst();
      if (Succ->isBoundaryNode())
        continue;
      if (OE.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
      if (OE.ignoreDependence(true))
        continue;
      alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
                                  OE.getDistance() * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
2253
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such predecessor was found. When S is
/// given, only nodes contained in S are considered.
                  const NodeSet *S = nullptr) {
  Preds.clear();

  for (SUnit *SU : NodeOrder) {
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      // Restrict to members of S when a node-set filter is given.
      if (S && S->count(PredSU) == 0)
        continue;
      if (IE.ignoreDependence(true))
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Preds.insert(PredSU);
    }

    // Anti-dependence successors are also treated as members of Pred_L.
    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      if (!OE.isAntiDep())
        continue;
      if (S && S->count(SuccSU) == 0)
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Preds.insert(SuccSU);
    }
  }
  return !Preds.empty();
}
2289
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such successor was found. When S is
/// given, only nodes contained in S are considered.
                  const NodeSet *S = nullptr) {
  Succs.clear();

  for (SUnit *SU : NodeOrder) {
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      // Restrict to members of S when a node-set filter is given.
      if (S && S->count(SuccSU) == 0)
        continue;
      if (OE.ignoreDependence(false))
        continue;
      if (NodeOrder.count(SuccSU) == 0)
        Succs.insert(SuccSU);
    }

    // Anti-dependence predecessors are also treated as members of Succ_L.
    // FIXME: The following loop-carried dependencies may also need to be
    // considered.
    // - Physical register dependencies (true-dependence and WAW).
    // - Memory dependencies.
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      if (!IE.isAntiDep())
        continue;
      if (S && S->count(PredSU) == 0)
        continue;
      if (NodeOrder.count(PredSU) == 0)
        Succs.insert(PredSU);
    }
  }
  return !Succs.empty();
}
2325
2326/// Return true if there is a path from the specified node to any of the nodes
2327/// in DestNodes. Keep track and return the nodes in any path.
2328static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2329 SetVector<SUnit *> &DestNodes,
2330 SetVector<SUnit *> &Exclude,
2331 SmallPtrSet<SUnit *, 8> &Visited,
2332 SwingSchedulerDDG *DDG) {
2333 if (Cur->isBoundaryNode())
2334 return false;
2335 if (Exclude.contains(Cur))
2336 return false;
2337 if (DestNodes.contains(Cur))
2338 return true;
2339 if (!Visited.insert(Cur).second)
2340 return Path.contains(Cur);
2341 bool FoundPath = false;
2342 for (const auto &OE : DDG->getOutEdges(Cur))
2343 if (!OE.ignoreDependence(false))
2344 FoundPath |=
2345 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2346 for (const auto &IE : DDG->getInEdges(Cur))
2347 if (IE.isAntiDep() && IE.getDistance() == 0)
2348 FoundPath |=
2349 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2350 if (FoundPath)
2351 Path.insert(Cur);
2352 return FoundPath;
2353}
2354
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
                            NodeSet &NS) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  // First collect everything the node-set uses (virtual registers directly,
  // allocatable physical registers by register unit), skipping PHI uses.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(VirtRegOrUnit(Reg));
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(VirtRegOrUnit(Unit));
    }
  }
  // A non-dead def that never appears in the use set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(VirtRegOrUnit(Reg)))
            LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(VirtRegOrUnit(Unit)))
              LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2394
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Walk the node-set bottom-up in instruction order (highest NodeNum
    // first), matching the upward direction of the pressure tracker.
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      // Any excess over the target's limits marks the node-set; the first
      // offending SU is recorded and the scan stops.
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2438
2439/// A heuristic to colocate node sets that have the same set of
2440/// successors.
2441void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2442 unsigned Colocate = 0;
2443 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2444 NodeSet &N1 = NodeSets[i];
2445 SmallSetVector<SUnit *, 8> S1;
2446 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2447 continue;
2448 for (int j = i + 1; j < e; ++j) {
2449 NodeSet &N2 = NodeSets[j];
2450 if (N1.compareRecMII(N2) != 0)
2451 continue;
2452 SmallSetVector<SUnit *, 8> S2;
2453 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2454 continue;
2455 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2456 N1.setColocate(++Colocate);
2457 N2.setColocate(Colocate);
2458 break;
2459 }
2460 }
2461 }
2462}
2463
2464/// Check if the existing node-sets are profitable. If not, then ignore the
2465/// recurrent node-sets, and attempt to schedule all nodes together. This is
2466/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2467/// then it's best to try to schedule all instructions together instead of
2468/// starting with the recurrent node-sets.
2469void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2470 // Look for loops with a large MII.
2471 if (MII < 17)
2472 return;
2473 // Check if the node-set contains only a simple add recurrence.
2474 for (auto &NS : NodeSets) {
2475 if (NS.getRecMII() > 2)
2476 return;
2477 if (NS.getMaxDepth() > MII)
2478 return;
2479 }
2480 NodeSets.clear();
2481 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2482}
2483
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  SmallPtrSet<SUnit *, 8> Visited;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    SmallSetVector<SUnit *, 8> N;
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Everything placed so far is excluded from later path searches.
    NodesAdded.insert_range(I);
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  SmallSetVector<SUnit *, 8> N;
  if (succ_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a node
  // in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new nodes sets with the connected nodes any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2547
2548/// Add the node to the set, and add all of its connected nodes to the set.
2549void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2550 SetVector<SUnit *> &NodesAdded) {
2551 NewSet.insert(SU);
2552 NodesAdded.insert(SU);
2553 for (auto &OE : DDG->getOutEdges(SU)) {
2554 SUnit *Successor = OE.getDst();
2555 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2556 NodesAdded.count(Successor) == 0)
2557 addConnectedNodes(Successor, NewSet, NodesAdded);
2558 }
2559 for (auto &IE : DDG->getInEdges(SU)) {
2560 SUnit *Predecessor = IE.getSrc();
2561 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2562 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2563 }
2564}
2565
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in a different container.
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  // Result is rebuilt from scratch on every call.
  Result.clear();
  for (SUnit *SU : Set1) {
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
2577
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two sets that start with the same node are fused: NJ is folded
      // into NI and then erased.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger recurrence MII of the two.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() invalidates the end iterator; refresh E. J already
        // points at the next candidate, so it is not advanced.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2598
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from J every node that already appears in the earlier set I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // erase() invalidates the end iterator; refresh E. J already
        // points at the next candidate, so it is not advanced.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2614
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Pick the initial direction and the seed set R based on how this
    // node-set relates to the nodes already ordered.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // Single node-set: seed with the nodes that have no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    // Alternate between top-down and bottom-up passes until the ready set
    // R is exhausted.
    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Successors inside this node-set become ready.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // Hitting the node marked by the register-pressure filter forces
          // a restart in the top-down direction from the set's first node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Predecessors inside this node-set become ready.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2780
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
  if (LimitRegPressure) {
    HRPDetector =
        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
    HRPDetector->init(RegClassInfo);
  }
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG(
          dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));

      // The direction of the insertion search depends on which of the two
      // bounds was constrained by already-scheduled neighbors.
      if (EarlyStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
        // Only predecessors constrain SU: search forward from EarlyStart.
        scheduleFound =
            Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
      else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
        // Only successors constrain SU: search backward from LateStart.
        scheduleFound =
            Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
      else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
        // When scheduling a Phi it is better to start at the late cycle and
        // go backwards. The default order may insert the Phi too far away
        // from its first dependence.
        // Also, do backward search when all scheduled predecessors are
        // loop-carried output/order dependencies. Empirically, there are also
        // cases where scheduling becomes possible with backward search.
        if (SU->getInstr()->isPHI() ||
            Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
          scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
      } else {
        // Unconstrained: fall back to the node's ASAP offset from the first
        // scheduled cycle.
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }

      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, validate it against the validation-only
    // dependencies.
    if (scheduleFound)
      scheduleFound = DDG->isValidSchedule(Schedule);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);

    // If a schedule was found and the option is enabled, check if the schedule
    // might generate additional register spills/fills.
    if (scheduleFound && LimitRegPressure)
      scheduleFound =
          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  // The target gets a final veto over the schedule.
  if (scheduleFound) {
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // A single-stage schedule offers no overlap between iterations, so only
  // multi-stage schedules are reported as successful pipelining.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2908
  // Scan MI's register uses looking for the single virtual register whose
  // definition lives in MI's own basic block; an invalid Register is
  // returned if any use is physical or if the candidate is not unique.
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  Register Result;
  for (const MachineOperand &Use : MI.all_uses()) {
    Register Reg = Use.getReg();
    // Any physical-register use disqualifies the instruction entirely.
    if (!Reg.isVirtual())
      return Register();
    // Skip operands whose definition is outside MI's block.
    if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
      continue;
    // A second in-block operand means the candidate is not unique.
    if (Result)
      return Register();
    Result = Reg;
  }
  return Result;
}
2924
/// When Op is a value that is incremented recursively in a loop and there is a
/// unique instruction that increments it, returns true and sets Value.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return false;

  Register OrgReg = Op.getReg();
  Register CurReg = OrgReg;
  const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();

  const TargetInstrInfo *TII =
      LoopBB->getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      LoopBB->getParent()->getSubtarget().getRegisterInfo();

  // The walk must see exactly one PHI and exactly one increment to succeed.
  MachineInstr *Phi = nullptr;
  MachineInstr *Increment = nullptr;

  // Traverse definitions until it reaches Op or an instruction that does not
  // satisfy the condition.
  // Acceptable example:
  //   bb.0:
  //     %0 = PHI %3, %bb.0, ...
  //     %2 = ADD %0, Value
  //     ... = LOAD %2(Op)
  //     %3 = COPY %2
  while (true) {
    if (!CurReg.isValid() || !CurReg.isVirtual())
      return false;
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    // The whole def-chain must stay inside the loop block.
    if (Def->getParent() != LoopBB)
      return false;

    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // There must be just one Phi
      if (Phi)
        return false;
      Phi = Def;
      // Continue the walk through the PHI's loop-carried input.
      CurReg = getLoopPhiReg(*Def, LoopBB);
    } else if (TII->getIncrementValue(*Def, Value)) {
      // Potentially a unique increment
      if (Increment)
        // Multiple increments exist
        return false;

      const MachineOperand *BaseOp;
      int64_t Offset;
      bool OffsetIsScalable;
      if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
                                       TRI)) {
        // Pre/post increment instruction
        CurReg = BaseOp->getReg();
      } else {
        // If only one of the operands is defined within the loop, it is assumed
        // to be an incremented value.
        CurReg = findUniqueOperandDefinedInLoop(*Def);
        if (!CurReg.isValid())
          return false;
      }
      Increment = Def;
    } else {
      return false;
    }
    // The chain closed back on the original register: the recurrence is
    // complete.
    if (CurReg == OrgReg)
      break;
  }

  if (!Phi || !Increment)
    return false;

  return true;
}
3003
3004/// Return true if we can compute the amount the instruction changes
3005/// during each iteration. Set Delta to the amount of the change.
3006bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
3007 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3008 const MachineOperand *BaseOp;
3009 int64_t Offset;
3010 bool OffsetIsScalable;
3011 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
3012 return false;
3013
3014 // FIXME: This algorithm assumes instructions have fixed-size offsets.
3015 if (OffsetIsScalable)
3016 return false;
3017
3018 if (!BaseOp->isReg())
3019 return false;
3020
3021 return findLoopIncrementValue(*BaseOp, Delta);
3022}
3023
3024/// Check if we can change the instruction to use an offset value from the
3025/// previous iteration. If so, return true and set the base and offset values
3026/// so that we can rewrite the load, if necessary.
3027/// v1 = Phi(v0, v3)
3028/// v2 = load v1, 0
3029/// v3 = post_store v1, 4, x
3030/// This function enables the load to be rewritten as v2 = load v3, 4.
3031bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3032 unsigned &BasePos,
3033 unsigned &OffsetPos,
3034 Register &NewBase,
3035 int64_t &Offset) {
3036 // Get the load instruction.
3037 if (TII->isPostIncrement(*MI))
3038 return false;
3039 unsigned BasePosLd, OffsetPosLd;
3040 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3041 return false;
3042 Register BaseReg = MI->getOperand(BasePosLd).getReg();
3043
3044 // Look for the Phi instruction.
3045 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3046 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3047 if (!Phi || !Phi->isPHI())
3048 return false;
3049 // Get the register defined in the loop block.
3050 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3051 if (!PrevReg)
3052 return false;
3053
3054 // Check for the post-increment load/store instruction.
3055 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3056 if (!PrevDef || PrevDef == MI)
3057 return false;
3058
3059 if (!TII->isPostIncrement(*PrevDef))
3060 return false;
3061
3062 unsigned BasePos1 = 0, OffsetPos1 = 0;
3063 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3064 return false;
3065
3066 // Make sure that the instructions do not access the same memory location in
3067 // the next iteration.
3068 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3069 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3070 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3071 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3072 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3073 MF.deleteMachineInstr(NewMI);
3074 if (!Disjoint)
3075 return false;
3076
3077 // Set the return value once we determine that we return true.
3078 BasePos = BasePosLd;
3079 OffsetPos = OffsetPosLd;
3080 NewBase = PrevReg;
3081 Offset = StoreOffset;
3082 return true;
3083}
3084
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
    SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
  // NOTE(review): the start of the signature and the iterator declaration
  // that receives InstrChanges.find(SU) are not visible in this extraction.
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // Replacement base register and per-stage offset adjustment recorded for
    // this instruction.
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    // Only rewrite when the use is scheduled in an earlier stage than its
    // in-loop definition.
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // The definition is earlier in cycle order: switch to the recorded
        // replacement base and drop one stage of offset adjustment.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Replace MI with the adjusted clone in all bookkeeping maps.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
3120
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  // NOTE(review): the declaration of 'Visited' (a set used below to detect
  // Phi cycles) is not visible in this extraction.
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Stop if this Phi was already followed (a cycle of Phis).
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs starting at index 1; follow
    // the incoming value that flows in from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3138
/// Return false if there is no overlap between the region accessed by BaseMI in
/// an iteration and the region accessed by OtherMI in subsequent iterations.
    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
  // Both accesses must advance by the same known amount per iteration,
  // otherwise conservatively report an overlap.
  int DeltaB, DeltaO, Delta;
  if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
      DeltaB != DeltaO)
    return true;
  Delta = DeltaB;

  const MachineOperand *BaseOpB, *BaseOpO;
  int64_t OffsetB, OffsetO;
  bool OffsetBIsScalable, OffsetOIsScalable;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
                                    OffsetBIsScalable, TRI) ||
      !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                    OffsetOIsScalable, TRI))
    return true;

  // Scalable offsets cannot be compared numerically below.
  if (OffsetBIsScalable || OffsetOIsScalable)
    return true;

  if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
    // Pass cases with different base operands but same initial values.
    // Typically for when pre/post increment is used.

    if (!BaseOpB->isReg() || !BaseOpO->isReg())
      return true;
    Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
    if (!RegB.isVirtual() || !RegO.isVirtual())
      return true;

    // Both bases must be Phis whose initial (preheader) values are defined
    // by identical instructions.
    MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
    MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
    if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
      return true;

    Register InitValB;
    Register LoopValB;
    Register InitValO;
    Register LoopValO;
    getPhiRegs(*DefB, BB, InitValB, LoopValB);
    getPhiRegs(*DefO, BB, InitValO, LoopValO);
    MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
    MachineInstr *InitDefO = MRI.getVRegDef(InitValO);

    if (!InitDefB->isIdenticalTo(*InitDefO))
      return true;
  }

  LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
  LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
    return true;

  LLVM_DEBUG({
    dbgs() << "Overlap check:\n";
    dbgs() << " BaseMI: ";
    BaseMI->dump();
    dbgs() << " Base + " << OffsetB << " + I * " << Delta
           << ", Len: " << AccessSizeB.getValue() << "\n";
    dbgs() << " OtherMI: ";
    OtherMI->dump();
    dbgs() << " Base + " << OffsetO << " + I * " << Delta
           << ", Len: " << AccessSizeO.getValue() << "\n";
  });

  // Excessive overlap may be detected in strided patterns.
  // For example, the memory addresses of the store and the load in
  //   for (i=0; i<n; i+=2) a[i+1] = a[i];
  // are assumed to overlap.
  if (Delta < 0) {
    // Addresses decrease each iteration: compare the lowest address BaseMI
    // touches against the highest address OtherMI touches next iteration.
    int64_t BaseMinAddr = OffsetB;
    int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
    if (BaseMinAddr > OhterNextIterMaxAddr) {
      LLVM_DEBUG(dbgs() << " Result: No overlap\n");
      return false;
    }
  } else {
    // Addresses increase each iteration: the symmetric comparison.
    int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
    int64_t OtherNextIterMinAddr = OffsetO + Delta;
    if (BaseMaxAddr < OtherNextIterMinAddr) {
      LLVM_DEBUG(dbgs() << " Result: No overlap\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << " Result: Overlap\n");
  return true;
}
3232
3233void SwingSchedulerDAG::postProcessDAG() {
3234 for (auto &M : Mutations)
3235 M->apply(this);
3236}
3237
3238/// Try to schedule the node at the specified StartCycle and continue
3239/// until the node is schedule or the EndCycle is reached. This function
3240/// returns true if the node is scheduled. This routine may search either
3241/// forward or backward for a place to insert the instruction based upon
3242/// the relative values of StartCycle and EndCycle.
3243bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3244 bool forward = true;
3245 LLVM_DEBUG({
3246 dbgs() << "Trying to insert node between " << StartCycle << " and "
3247 << EndCycle << " II: " << II << "\n";
3248 });
3249 if (StartCycle > EndCycle)
3250 forward = false;
3251
3252 // The terminating condition depends on the direction.
3253 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3254 for (int curCycle = StartCycle; curCycle != termCycle;
3255 forward ? ++curCycle : --curCycle) {
3256
3257 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3258 ProcItinResources.canReserveResources(*SU, curCycle)) {
3259 LLVM_DEBUG({
3260 dbgs() << "\tinsert at cycle " << curCycle << " ";
3261 SU->getInstr()->dump();
3262 });
3263
3264 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3265 ProcItinResources.reserveResources(*SU, curCycle);
3266 ScheduledInstrs[curCycle].push_back(SU);
3267 InstrToCycle.insert(std::make_pair(SU, curCycle));
3268 if (curCycle > LastCycle)
3269 LastCycle = curCycle;
3270 if (curCycle < FirstCycle)
3271 FirstCycle = curCycle;
3272 return true;
3273 }
3274 LLVM_DEBUG({
3275 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3276 SU->getInstr()->dump();
3277 });
3278 }
3279 return false;
3280}
3281
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi it feeds through the back-edge; otherwise return nullptr.
/// These instructions are characterized by having a back-edge
/// to a Phi, which contains a reference to another Phi.
  // An anti edge into a Phi marks the back-edge; if that Phi in turn feeds
  // another Phi through a data edge, SU's value spans iterations.
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3293
3294/// Compute the scheduling start slot for the instruction. The start slot
3295/// depends on any predecessor or successor nodes scheduled already.
3296void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3297 int II, SwingSchedulerDAG *DAG) {
3298 const SwingSchedulerDDG *DDG = DAG->getDDG();
3299
3300 // Iterate over each instruction that has been scheduled already. The start
3301 // slot computation depends on whether the previously scheduled instruction
3302 // is a predecessor or successor of the specified instruction.
3303 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3304 for (SUnit *I : getInstructions(cycle)) {
3305 for (const auto &IE : DDG->getInEdges(SU)) {
3306 if (IE.getSrc() == I) {
3307 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3308 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3309 }
3310 }
3311
3312 for (const auto &OE : DDG->getOutEdges(SU)) {
3313 if (OE.getDst() == I) {
3314 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3315 *MinLateStart = std::min(*MinLateStart, LateStart);
3316 }
3317 }
3318
3319 SUnit *BE = multipleIterations(I, DAG);
3320 for (const auto &Dep : SU->Preds) {
3321 // For instruction that requires multiple iterations, make sure that
3322 // the dependent instruction is not scheduled past the definition.
3323 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3324 !SU->isPred(I))
3325 *MinLateStart = std::min(*MinLateStart, cycle);
3326 }
3327 }
3328 }
3329}
3330
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
                                 std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Flags describing where SU must sit relative to positions MoveUse/MoveDef
  // already in Insts.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  // Scan every instruction already placed in this cycle, tracking its
  // position Pos in the deque.
  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If the base register was rewritten for this SU, compare against the
      // replacement register instead.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def. Remove both, then re-insert the use, SU,
  // and the def recursively so each lands in a consistent position.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3463
/// Return true if the scheduled Phi has a loop carried operand.
                             MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  // Trivially true after the guard above; retained as written.
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  // Conservatively treat an unknown producer of the loop value, or a Phi
  // producer, as loop carried.
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  // Carried if the loop-value producer runs after the Phi's cycle, or in a
  // stage no later than the Phi's stage.
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3486
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
                                      MachineInstr *Def,
                                      MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def qualifies when one of its defs is the Phi's back-edge value.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3513
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): as written this returns false whenever any in-edge source
  // of SU is already scheduled; the loop-carried filtering implied by the
  // doc comment is not visible here — confirm against the DDG's in-edge
  // semantics.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3523
/// Determine transitive dependences of unpipelineable instructions
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with every instruction the target asks us to ignore
  // for pipelining.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  // Transitively mark everything those instructions depend on.
  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3552
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Nodes already at stage 0 (or not flagged) just contribute to the new
    // last cycle.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule.
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move the instruction between the per-cycle lists if its cycle changed.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             |  Inst0 <-+
    //   SU order  |          | artificial dep
    //             |  Inst1 --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3621
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    // Only instructions that define physical registers need checking.
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          // The physical use must be in the same stage and strictly after
          // the def's cycle.
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3650
3651/// A property of the node order in swing-modulo-scheduling is
3652/// that for nodes outside circuits the following holds:
3653/// none of them is scheduled after both a successor and a
3654/// predecessor.
3655/// The method below checks whether the property is met.
3656/// If not, debug information is printed and statistics information updated.
3657/// Note that we do not use an assert statement.
3658/// The reason is that although an invalid node order may prevent
3659/// the pipeliner from finding a pipelined schedule for arbitrary II,
3660/// it does not lead to the generation of incorrect code.
3661void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3662
3663 // a sorted vector that maps each SUnit to its index in the NodeOrder
3664 typedef std::pair<SUnit *, unsigned> UnitIndex;
3665 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3666
3667 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3668 Indices.push_back(std::make_pair(NodeOrder[i], i));
3669
3670 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3671 return std::get<0>(i1) < std::get<0>(i2);
3672 };
3673
3674 // sort, so that we can perform a binary search
3675 llvm::sort(Indices, CompareKey);
3676
3677 bool Valid = true;
3678 (void)Valid;
3679 // for each SUnit in the NodeOrder, check whether
3680 // it appears after both a successor and a predecessor
3681 // of the SUnit. If this is the case, and the SUnit
3682 // is not part of circuit, then the NodeOrder is not
3683 // valid.
3684 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3685 SUnit *SU = NodeOrder[i];
3686 unsigned Index = i;
3687
3688 bool PredBefore = false;
3689 bool SuccBefore = false;
3690
3691 SUnit *Succ;
3692 SUnit *Pred;
3693 (void)Succ;
3694 (void)Pred;
3695
3696 for (const auto &IE : DDG->getInEdges(SU)) {
3697 SUnit *PredSU = IE.getSrc();
3698 unsigned PredIndex = std::get<1>(
3699 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3700 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3701 PredBefore = true;
3702 Pred = PredSU;
3703 break;
3704 }
3705 }
3706
3707 for (const auto &OE : DDG->getOutEdges(SU)) {
3708 SUnit *SuccSU = OE.getDst();
3709 // Do not process a boundary node, it was not included in NodeOrder,
3710 // hence not in Indices either, call to std::lower_bound() below will
3711 // return Indices.end().
3712 if (SuccSU->isBoundaryNode())
3713 continue;
3714 unsigned SuccIndex = std::get<1>(
3715 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3716 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3717 SuccBefore = true;
3718 Succ = SuccSU;
3719 break;
3720 }
3721 }
3722
3723 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3724 // instructions in circuits are allowed to be scheduled
3725 // after both a successor and predecessor.
3726 bool InCircuit = llvm::any_of(
3727 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3728 if (InCircuit)
3729 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3730 else {
3731 Valid = false;
3732 NumNodeOrderIssues++;
3733 LLVM_DEBUG(dbgs() << "Predecessor ");
3734 }
3735 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3736 << " are scheduled before node " << SU->NodeNum
3737 << "\n");
3738 }
3739 }
3740
3741 LLVM_DEBUG({
3742 if (!Valid)
3743 dbgs() << "Invalid node order found!\n";
3744 });
3745}
3746
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // Tracks the most recent tied def/use pair seen: OverlapReg is the tied
  // use (p), NewBaseReg the tied def (p').
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        // NOTE(review): the iterator declaration that receives
        // InstrChanges.find(SU) is not visible in this extraction.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // Reset the tracked pair after a use has been handled.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3798
std::deque<SUnit *>
                                   const std::deque<SUnit *> &Instrs) const {
  // Emit all Phis first, preserving their relative order.
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  // Then place every non-Phi so that definitions precede uses.
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  // Phis followed by the ordered non-Phis.
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}
3815
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      // Prepend in reverse so the relative order within the stage survives.
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
3851
3853 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3854 << " depth " << MaxDepth << " col " << Colocate << "\n";
3855 for (const auto &I : Nodes)
3856 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3857 os << "\n";
3858}
3859
3860#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3861/// Print the schedule information to the given output.
3863 // Iterate over each cycle.
3864 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3865 // Iterate over each instruction in the cycle.
3866 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3867 for (SUnit *CI : cycleInstrs->second) {
3868 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3869 os << "(" << CI->NodeNum << ") ";
3870 CI->getInstr()->print(os);
3871 os << "\n";
3872 }
3873 }
3874}
3875
3876/// Utility function used for debugging to print the schedule.
3879
3880void ResourceManager::dumpMRT() const {
3881 LLVM_DEBUG({
3882 if (UseDFA)
3883 return;
3884 std::stringstream SS;
3885 SS << "MRT:\n";
3886 SS << std::setw(4) << "Slot";
3887 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3888 SS << std::setw(3) << I;
3889 SS << std::setw(7) << "#Mops"
3890 << "\n";
3891 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3892 SS << std::setw(4) << Slot;
3893 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3894 SS << std::setw(3) << MRT[Slot][I];
3895 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3896 }
3897 dbgs() << SS.str();
3898 });
3899}
3900#endif
3901
3903 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3904 unsigned ProcResourceID = 0;
3905
3906 // We currently limit the resource kinds to 64 and below so that we can use
3907 // uint64_t for Masks
3908 assert(SM.getNumProcResourceKinds() < 64 &&
3909 "Too many kinds of resources, unsupported");
3910 // Create a unique bitmask for every processor resource unit.
3911 // Skip resource at index 0, since it always references 'InvalidUnit'.
3912 Masks.resize(SM.getNumProcResourceKinds());
3913 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3914 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3915 if (Desc.SubUnitsIdxBegin)
3916 continue;
3917 Masks[I] = 1ULL << ProcResourceID;
3918 ProcResourceID++;
3919 }
3920 // Create a unique bitmask for every processor resource group.
3921 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3922 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3923 if (!Desc.SubUnitsIdxBegin)
3924 continue;
3925 Masks[I] = 1ULL << ProcResourceID;
3926 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3927 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3928 ProcResourceID++;
3929 }
3930 LLVM_DEBUG({
3931 if (SwpShowResMask) {
3932 dbgs() << "ProcResourceDesc:\n";
3933 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3934 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3935 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3936 ProcResource->Name, I, Masks[I],
3937 ProcResource->NumUnits);
3938 }
3939 dbgs() << " -----------------\n";
3940 }
3941 });
3942}
3943
3945 LLVM_DEBUG({
3946 if (SwpDebugResource)
3947 dbgs() << "canReserveResources:\n";
3948 });
3949 if (UseDFA)
3950 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3951 ->canReserveResources(&SU.getInstr()->getDesc());
3952
3953 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3954 if (!SCDesc->isValid()) {
3955 LLVM_DEBUG({
3956 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3957 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3958 });
3959 return true;
3960 }
3961
3962 reserveResources(SCDesc, Cycle);
3963 bool Result = !isOverbooked();
3964 unreserveResources(SCDesc, Cycle);
3965
3966 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3967 return Result;
3968}
3969
3970void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3971 LLVM_DEBUG({
3972 if (SwpDebugResource)
3973 dbgs() << "reserveResources:\n";
3974 });
3975 if (UseDFA)
3976 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3977 ->reserveResources(&SU.getInstr()->getDesc());
3978
3979 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3980 if (!SCDesc->isValid()) {
3981 LLVM_DEBUG({
3982 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3983 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3984 });
3985 return;
3986 }
3987
3988 reserveResources(SCDesc, Cycle);
3989
3990 LLVM_DEBUG({
3991 if (SwpDebugResource) {
3992 dumpMRT();
3993 dbgs() << "reserveResources: done!\n\n";
3994 }
3995 });
3996}
3997
3998void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3999 int Cycle) {
4000 assert(!UseDFA);
4001 for (const MCWriteProcResEntry &PRE : make_range(
4002 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4003 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4004 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4005
4006 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4007 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4008}
4009
4010void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4011 int Cycle) {
4012 assert(!UseDFA);
4013 for (const MCWriteProcResEntry &PRE : make_range(
4014 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4015 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4016 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4017
4018 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4019 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4020}
4021
4022bool ResourceManager::isOverbooked() const {
4023 assert(!UseDFA);
4024 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4025 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4026 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4027 if (MRT[Slot][I] > Desc->NumUnits)
4028 return true;
4029 }
4030 if (NumScheduledMops[Slot] > IssueWidth)
4031 return true;
4032 }
4033 return false;
4034}
4035
4036int ResourceManager::calculateResMIIDFA() const {
4037 assert(UseDFA);
4038
4039 // Sort the instructions by the number of available choices for scheduling,
4040 // least to most. Use the number of critical resources as the tie breaker.
4041 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4042 for (SUnit &SU : DAG->SUnits)
4043 FUS.calcCriticalResources(*SU.getInstr());
4044 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4045 FuncUnitOrder(FUS);
4046
4047 for (SUnit &SU : DAG->SUnits)
4048 FuncUnitOrder.push(SU.getInstr());
4049
4051 Resources.push_back(
4052 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4053
4054 while (!FuncUnitOrder.empty()) {
4055 MachineInstr *MI = FuncUnitOrder.top();
4056 FuncUnitOrder.pop();
4057 if (TII->isZeroCost(MI->getOpcode()))
4058 continue;
4059
4060 // Attempt to reserve the instruction in an existing DFA. At least one
4061 // DFA is needed for each cycle.
4062 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4063 unsigned ReservedCycles = 0;
4064 auto *RI = Resources.begin();
4065 auto *RE = Resources.end();
4066 LLVM_DEBUG({
4067 dbgs() << "Trying to reserve resource for " << NumCycles
4068 << " cycles for \n";
4069 MI->dump();
4070 });
4071 for (unsigned C = 0; C < NumCycles; ++C)
4072 while (RI != RE) {
4073 if ((*RI)->canReserveResources(*MI)) {
4074 (*RI)->reserveResources(*MI);
4075 ++ReservedCycles;
4076 break;
4077 }
4078 RI++;
4079 }
4080 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4081 << ", NumCycles:" << NumCycles << "\n");
4082 // Add new DFAs, if needed, to reserve resources.
4083 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4085 << "NewResource created to reserve resources"
4086 << "\n");
4087 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4088 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4089 NewResource->reserveResources(*MI);
4090 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4091 }
4092 }
4093
4094 int Resmii = Resources.size();
4095 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4096 return Resmii;
4097}
4098
4100 if (UseDFA)
4101 return calculateResMIIDFA();
4102
4103 // Count each resource consumption and divide it by the number of units.
4104 // ResMII is the max value among them.
4105
4106 int NumMops = 0;
4107 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4108 for (SUnit &SU : DAG->SUnits) {
4109 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4110 continue;
4111
4112 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4113 if (!SCDesc->isValid())
4114 continue;
4115
4116 LLVM_DEBUG({
4117 if (SwpDebugResource) {
4118 DAG->dumpNode(SU);
4119 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4120 << " WriteProcRes: ";
4121 }
4122 });
4123 NumMops += SCDesc->NumMicroOps;
4124 for (const MCWriteProcResEntry &PRE :
4125 make_range(STI->getWriteProcResBegin(SCDesc),
4126 STI->getWriteProcResEnd(SCDesc))) {
4127 LLVM_DEBUG({
4128 if (SwpDebugResource) {
4129 const MCProcResourceDesc *Desc =
4130 SM.getProcResource(PRE.ProcResourceIdx);
4131 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4132 }
4133 });
4134 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4135 }
4136 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4137 }
4138
4139 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4140 LLVM_DEBUG({
4141 if (SwpDebugResource)
4142 dbgs() << "#Mops: " << NumMops << ", "
4143 << "IssueWidth: " << IssueWidth << ", "
4144 << "Cycles: " << Result << "\n";
4145 });
4146
4147 LLVM_DEBUG({
4148 if (SwpDebugResource) {
4149 std::stringstream SS;
4150 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4151 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4152 << "\n";
4153 dbgs() << SS.str();
4154 }
4155 });
4156 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4157 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4158 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4159 LLVM_DEBUG({
4160 if (SwpDebugResource) {
4161 std::stringstream SS;
4162 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4163 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4164 << std::setw(10) << Cycles << "\n";
4165 dbgs() << SS.str();
4166 }
4167 });
4168 if (Cycles > Result)
4169 Result = Cycles;
4170 }
4171 return Result;
4172}
4173
4175 InitiationInterval = II;
4176 DFAResources.clear();
4177 DFAResources.resize(II);
4178 for (auto &I : DFAResources)
4179 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4180 MRT.clear();
4181 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4182 NumScheduledMops.clear();
4183 NumScheduledMops.resize(II);
4184}
4185
4186bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4187 if (Pred.isArtificial() || Dst->isBoundaryNode())
4188 return true;
4189 // Currently, dependence that is an anti-dependences but not a loop-carried is
4190 // also ignored. This behavior is preserved to prevent regression.
4191 // FIXME: Remove if this doesn't have significant impact on performance
4192 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4193}
4194
4195SwingSchedulerDDG::SwingSchedulerDDGEdges &
4196SwingSchedulerDDG::getEdges(const SUnit *SU) {
4197 if (SU == EntrySU)
4198 return EntrySUEdges;
4199 if (SU == ExitSU)
4200 return ExitSUEdges;
4201 return EdgesVec[SU->NodeNum];
4202}
4203
4204const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4205SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4206 if (SU == EntrySU)
4207 return EntrySUEdges;
4208 if (SU == ExitSU)
4209 return ExitSUEdges;
4210 return EdgesVec[SU->NodeNum];
4211}
4212
4213void SwingSchedulerDDG::addEdge(const SUnit *SU,
4214 const SwingSchedulerDDGEdge &Edge) {
4215 assert(!Edge.isValidationOnly() &&
4216 "Validation-only edges are not expected here.");
4217
4218 auto &Edges = getEdges(SU);
4219 if (Edge.getSrc() == SU)
4220 Edges.Succs.push_back(Edge);
4221 else
4222 Edges.Preds.push_back(Edge);
4223}
4224
4225void SwingSchedulerDDG::initEdges(SUnit *SU) {
4226 for (const auto &PI : SU->Preds) {
4227 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4228 /*IsValidationOnly=*/false);
4229 addEdge(SU, Edge);
4230 }
4231
4232 for (const auto &SI : SU->Succs) {
4233 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4234 /*IsValidationOnly=*/false);
4235 addEdge(SU, Edge);
4236 }
4237}
4238
4239SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4240 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4241 : EntrySU(EntrySU), ExitSU(ExitSU) {
4242 EdgesVec.resize(SUnits.size());
4243
4244 // Add non-loop-carried edges based on the DAG.
4245 initEdges(EntrySU);
4246 initEdges(ExitSU);
4247 for (auto &SU : SUnits)
4248 initEdges(&SU);
4249
4250 // Add loop-carried edges, which are not represented in the DAG.
4251 for (SUnit &SU : SUnits) {
4252 SUnit *Src = &SU;
4253 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4254 SDep Base(Src, SDep::Barrier);
4255 Base.setLatency(1);
4256 for (SUnit *Dst : *OD) {
4257 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4258 /*IsValidationOnly=*/true);
4259 Edge.setDistance(1);
4260 ValidationOnlyEdges.push_back(Edge);
4261
4262 // Store the edge as an extra edge if it meets the following conditions:
4263 //
4264 // - The edge is a loop-carried order dependency.
4265 // - The edge is a back edge in terms of the original instruction
4266 // order.
4267 // - The destination instruction may load.
4268 // - The source instruction may store but does not load.
4269 //
4270 // These conditions are inherited from a previous implementation to
4271 // preserve the existing behavior and avoid regressions.
4272 bool UseAsExtraEdge = [&]() {
4273 if (Edge.getDistance() == 0 || !Edge.isOrderDep())
4274 return false;
4275
4276 SUnit *Src = Edge.getSrc();
4277 SUnit *Dst = Edge.getDst();
4278 if (Src->NodeNum < Dst->NodeNum)
4279 return false;
4280
4281 MachineInstr *SrcMI = Src->getInstr();
4282 MachineInstr *DstMI = Dst->getInstr();
4283 return DstMI->mayLoad() && !SrcMI->mayLoad() && SrcMI->mayStore();
4284 }();
4285 if (UseAsExtraEdge)
4286 getEdges(Edge.getSrc()).ExtraSuccs.push_back(Edge.getDst());
4287 }
4288 }
4289 }
4290}
4291
4292const SwingSchedulerDDG::EdgesType &
4294 return getEdges(SU).Preds;
4295}
4296
4297const SwingSchedulerDDG::EdgesType &
4299 return getEdges(SU).Succs;
4300}
4301
// Returns the ExtraSuccs list recorded for SU: destinations of loop-carried
// order back-edges whose destination may load and whose source may store
// (populated in the SwingSchedulerDDG constructor). NOTE(review): the
// signature line of this accessor is not visible in this chunk — confirm
// its declared return type against the header.
 4303 return getEdges(SU).ExtraSuccs;
 4304 }
4305
4306/// Check if \p Schedule doesn't violate the validation-only dependencies.
4308 unsigned II = Schedule.getInitiationInterval();
4309
4310 auto ExpandCycle = [&](SUnit *SU) {
4311 int Stage = Schedule.stageScheduled(SU);
4312 int Cycle = Schedule.cycleScheduled(SU);
4313 return Cycle + (Stage * II);
4314 };
4315
4316 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4317 SUnit *Src = Edge.getSrc();
4318 SUnit *Dst = Edge.getDst();
4319 if (!Src->isInstr() || !Dst->isInstr())
4320 continue;
4321 int CycleSrc = ExpandCycle(Src);
4322 int CycleDst = ExpandCycle(Dst);
4323 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4324 if (CycleSrc > MaxLateStart) {
4325 LLVM_DEBUG({
4326 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4327 << Dst->NodeNum << "\n";
4328 });
4329 return false;
4330 }
4331 }
4332 return true;
4333}
4334
4335void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4336 const TargetInstrInfo *TII) {
4337 for (SUnit &SU : SUnits) {
4338 SUnit *Src = &SU;
4339 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4340 SDep Dep(Src, SDep::Barrier);
4341 Dep.setLatency(1);
4342 for (SUnit *Dst : *OrderDep) {
4343 SUnit *From = Src;
4344 SUnit *To = Dst;
4345 if (From->NodeNum > To->NodeNum)
4346 std::swap(From, To);
4347
4348 // Add a forward edge if the following conditions are met:
4349 //
4350 // - The instruction of the source node (FromMI) may read memory.
4351 // - The instruction of the target node (ToMI) may modify memory, but
4352 // does not read it.
4353 // - Neither instruction is a global barrier.
4354 // - The load appears before the store in the original basic block.
4355 // - There are no barrier or store instructions between the two nodes.
4356 // - The target node is unreachable from the source node in the current
4357 // DAG.
4358 //
4359 // TODO: These conditions are inherited from a previous implementation,
4360 // and some may no longer be necessary. For now, we conservatively
4361 // retain all of them to avoid regressions, but the logic could
4362 // potentially be simplified
4363 MachineInstr *FromMI = From->getInstr();
4364 MachineInstr *ToMI = To->getInstr();
4365 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4366 !TII->isGlobalMemoryObject(FromMI) &&
4367 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4368 SDep Pred = Dep;
4369 Pred.setSUnit(From);
4370 To->addPred(Pred);
4371 }
4372 }
4373 }
4374 }
4375}
4376
4378 const MachineRegisterInfo *MRI) const {
4379 const auto *Order = getOrderDepOrNull(SU);
4380
4381 if (!Order)
4382 return;
4383
4384 const auto DumpSU = [](const SUnit *SU) {
4385 std::ostringstream OSS;
4386 OSS << "SU(" << SU->NodeNum << ")";
4387 return OSS.str();
4388 };
4389
4390 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4391 << " Order\n";
4392 for (SUnit *Dst : *Order)
4393 dbgs() << " " << DumpSU(Dst) << "\n";
4394}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:598
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
static constexpr unsigned SM(unsigned Version)
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:632
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
PSetIterator getPressureSets(VirtRegOrUnit VRegOrUnit) const
Get an iterator over the pressure sets affected by the virtual register or register unit.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
static use_instr_iterator use_instr_end()
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
unsigned getWeight() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
typename vector_type::const_iterator iterator
Definition SetVector.h:72
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
ArrayRef< SUnit * > getExtraOutEdges(const SUnit *SU) const
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
CycleInfo::CycleT Cycle
Definition CycleInfo.h:26
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
RegState getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2052
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.