LLVM 20.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
74#include "llvm/Config/llvm-config.h"
75#include "llvm/IR/Attributes.h"
76#include "llvm/IR/Function.h"
77#include "llvm/MC/LaneBitmask.h"
78#include "llvm/MC/MCInstrDesc.h"
81#include "llvm/Pass.h"
84#include "llvm/Support/Debug.h"
87#include <algorithm>
88#include <cassert>
89#include <climits>
90#include <cstdint>
91#include <deque>
92#include <functional>
93#include <iomanip>
94#include <iterator>
95#include <map>
96#include <memory>
97#include <sstream>
98#include <tuple>
99#include <utility>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "pipeliner"
105
106STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
107STATISTIC(NumPipelined, "Number of loops software pipelined");
108STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
109STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
110STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
111STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
112STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
113STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
114STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
115STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
116STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
117
118/// A command line option to turn software pipelining on or off.
119static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
120 cl::desc("Enable Software Pipelining"));
121
122/// A command line option to enable SWP at -Os.
123static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
124 cl::desc("Enable SWP at Os."), cl::Hidden,
125 cl::init(false));
126
127/// A command line argument to limit minimum initial interval for pipelining.
128static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
129 cl::desc("Size limit for the MII."),
130 cl::Hidden, cl::init(27));
131
132/// A command line argument to force pipeliner to use specified initial
133/// interval.
134static cl::opt<int> SwpForceII("pipeliner-force-ii",
135 cl::desc("Force pipeliner to use specified II."),
136 cl::Hidden, cl::init(-1));
137
138/// A command line argument to limit the number of stages in the pipeline.
139static cl::opt<int>
140 SwpMaxStages("pipeliner-max-stages",
141 cl::desc("Maximum stages allowed in the generated scheduled."),
142 cl::Hidden, cl::init(3));
143
144/// A command line option to disable the pruning of chain dependences due to
145/// an unrelated Phi.
146static cl::opt<bool>
147 SwpPruneDeps("pipeliner-prune-deps",
148 cl::desc("Prune dependences between unrelated Phi nodes."),
149 cl::Hidden, cl::init(true));
150
151/// A command line option to disable the pruning of loop carried order
152/// dependences.
153static cl::opt<bool>
154 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
155 cl::desc("Prune loop carried order dependences."),
156 cl::Hidden, cl::init(true));
157
158#ifndef NDEBUG
159static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
160#endif
161
162static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
164 cl::desc("Ignore RecMII"));
165
166static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
167 cl::init(false));
168static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
169 cl::init(false));
170
172 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
173 cl::desc("Instead of emitting the pipelined code, annotate instructions "
174 "with the generated schedule for feeding into the "
175 "-modulo-schedule-test pass"));
176
178 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
179 cl::desc(
180 "Use the experimental peeling code generator for software pipelining"));
181
182static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
183 cl::desc("Range to search for II"),
184 cl::Hidden, cl::init(10));
185
186static cl::opt<bool>
187 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
188 cl::desc("Limit register pressure of scheduled loop"));
189
190static cl::opt<int>
191 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
192 cl::init(5),
193 cl::desc("Margin representing the unused percentage of "
194 "the register pressure limit"));
195
196static cl::opt<bool>
197 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
198 cl::desc("Use the MVE code generator for software pipelining"));
199
200namespace llvm {
201
202// A command line option to enable the CopyToPhi DAG mutation.
204 cl::init(true),
205 cl::desc("Enable CopyToPhi DAG Mutation"));
206
207/// A command line argument to force pipeliner to use specified issue
208/// width.
210 "pipeliner-force-issue-width",
211 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
212 cl::init(-1));
213
214/// A command line argument to set the window scheduling option.
217 cl::desc("Set how to use window scheduling algorithm."),
219 "Turn off window algorithm."),
221 "Use window algorithm after SMS algorithm fails."),
223 "Use window algorithm instead of SMS algorithm.")));
224
225} // end namespace llvm
226
227unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
228char MachinePipeliner::ID = 0;
229#ifndef NDEBUG
231#endif
233
235 "Modulo Software Pipelining", false, false)
241 "Modulo Software Pipelining", false, false)
242
243/// The "main" function for implementing Swing Modulo Scheduling.
244bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
245 if (skipFunction(mf.getFunction()))
246 return false;
247
248 if (!EnableSWP)
249 return false;
250
251 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
252 !EnableSWPOptSize.getPosition())
253 return false;
254
255 if (!mf.getSubtarget().enableMachinePipeliner())
256 return false;
257
258 // Cannot pipeline loops without instruction itineraries if we are using
259 // DFA for the pipeliner.
260 if (mf.getSubtarget().useDFAforSMS() &&
261 (!mf.getSubtarget().getInstrItineraryData() ||
262 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
263 return false;
264
265 MF = &mf;
266 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
267 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
268 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
269 TII = MF->getSubtarget().getInstrInfo();
270 RegClassInfo.runOnMachineFunction(*MF);
271
272 for (const auto &L : *MLI)
273 scheduleLoop(*L);
274
275 return false;
276}
277
278/// Attempt to perform the SMS algorithm on the specified loop. This function is
279/// the main entry point for the algorithm. The function identifies candidate
280/// loops, calculates the minimum initiation interval, and attempts to schedule
281/// the loop.
282bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
283 bool Changed = false;
284 for (const auto &InnerLoop : L)
285 Changed |= scheduleLoop(*InnerLoop);
286
287#ifndef NDEBUG
288 // Stop trying after reaching the limit (if any).
289 int Limit = SwpLoopLimit;
290 if (Limit >= 0) {
291 if (NumTries >= SwpLoopLimit)
292 return Changed;
293 NumTries++;
294 }
295#endif
296
297 setPragmaPipelineOptions(L);
298 if (!canPipelineLoop(L)) {
299 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
300 ORE->emit([&]() {
301 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
302 L.getStartLoc(), L.getHeader())
303 << "Failed to pipeline loop";
304 });
305
306 LI.LoopPipelinerInfo.reset();
307 return Changed;
308 }
309
310 ++NumTrytoPipeline;
311 if (useSwingModuloScheduler())
312 Changed = swingModuloScheduler(L);
313
314 if (useWindowScheduler(Changed))
315 Changed = runWindowScheduler(L);
316
317 LI.LoopPipelinerInfo.reset();
318 return Changed;
319}
320
321void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
322 // Reset the pragma for the next loop in iteration.
323 disabledByPragma = false;
324 II_setByPragma = 0;
325
326 MachineBasicBlock *LBLK = L.getTopBlock();
327
328 if (LBLK == nullptr)
329 return;
330
331 const BasicBlock *BBLK = LBLK->getBasicBlock();
332 if (BBLK == nullptr)
333 return;
334
335 const Instruction *TI = BBLK->getTerminator();
336 if (TI == nullptr)
337 return;
338
339 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
340 if (LoopID == nullptr)
341 return;
342
343 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
344 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
345
346 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
347 MDNode *MD = dyn_cast<MDNode>(MDO);
348
349 if (MD == nullptr)
350 continue;
351
352 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
353
354 if (S == nullptr)
355 continue;
356
357 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
358 assert(MD->getNumOperands() == 2 &&
359 "Pipeline initiation interval hint metadata should have two operands.");
361 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
362 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
363 } else if (S->getString() == "llvm.loop.pipeline.disable") {
364 disabledByPragma = true;
365 }
366 }
367}
368
369/// Return true if the loop can be software pipelined. The algorithm is
370/// restricted to loops with a single basic block. Make sure that the
371/// branch in the loop can be analyzed.
372bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
373 if (L.getNumBlocks() != 1) {
374 ORE->emit([&]() {
375 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
376 L.getStartLoc(), L.getHeader())
377 << "Not a single basic block: "
378 << ore::NV("NumBlocks", L.getNumBlocks());
379 });
380 return false;
381 }
382
383 if (disabledByPragma) {
384 ORE->emit([&]() {
385 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
386 L.getStartLoc(), L.getHeader())
387 << "Disabled by Pragma.";
388 });
389 return false;
390 }
391
392 // Check if the branch can't be understood because we can't do pipelining
393 // if that's the case.
394 LI.TBB = nullptr;
395 LI.FBB = nullptr;
396 LI.BrCond.clear();
397 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
398 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
399 NumFailBranch++;
400 ORE->emit([&]() {
401 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
402 L.getStartLoc(), L.getHeader())
403 << "The branch can't be understood";
404 });
405 return false;
406 }
407
408 LI.LoopInductionVar = nullptr;
409 LI.LoopCompare = nullptr;
411 if (!LI.LoopPipelinerInfo) {
412 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
413 NumFailLoop++;
414 ORE->emit([&]() {
415 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
416 L.getStartLoc(), L.getHeader())
417 << "The loop structure is not supported";
418 });
419 return false;
420 }
421
422 if (!L.getLoopPreheader()) {
423 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
424 NumFailPreheader++;
425 ORE->emit([&]() {
426 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
427 L.getStartLoc(), L.getHeader())
428 << "No loop preheader found";
429 });
430 return false;
431 }
432
433 // Remove any subregisters from inputs to phi nodes.
434 preprocessPhiNodes(*L.getHeader());
435 return true;
436}
437
438void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
440 SlotIndexes &Slots =
441 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
442
443 for (MachineInstr &PI : B.phis()) {
444 MachineOperand &DefOp = PI.getOperand(0);
445 assert(DefOp.getSubReg() == 0);
446 auto *RC = MRI.getRegClass(DefOp.getReg());
447
448 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
449 MachineOperand &RegOp = PI.getOperand(i);
450 if (RegOp.getSubReg() == 0)
451 continue;
452
453 // If the operand uses a subregister, replace it with a new register
454 // without subregisters, and generate a copy to the new register.
455 Register NewReg = MRI.createVirtualRegister(RC);
456 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
458 const DebugLoc &DL = PredB.findDebugLoc(At);
459 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
460 .addReg(RegOp.getReg(), getRegState(RegOp),
461 RegOp.getSubReg());
462 Slots.insertMachineInstrInMaps(*Copy);
463 RegOp.setReg(NewReg);
464 RegOp.setSubReg(0);
465 }
466 }
467}
468
469/// The SMS algorithm consists of the following main steps:
470/// 1. Computation and analysis of the dependence graph.
471/// 2. Ordering of the nodes (instructions).
472/// 3. Attempt to Schedule the loop.
473bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
474 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
475
477 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
479
480 MachineBasicBlock *MBB = L.getHeader();
481 // The kernel should not include any terminator instructions. These
482 // will be added back later.
483 SMS.startBlock(MBB);
484
485 // Compute the number of 'real' instructions in the basic block by
486 // ignoring terminators.
487 unsigned size = MBB->size();
489 E = MBB->instr_end();
490 I != E; ++I, --size)
491 ;
492
493 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
494 SMS.schedule();
495 SMS.exitRegion();
496
497 SMS.finishBlock();
498 return SMS.hasNewSchedule();
499}
500
510}
511
512bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
513 MachineSchedContext Context;
514 Context.MF = MF;
515 Context.MLI = MLI;
516 Context.MDT = MDT;
517 Context.PassConfig = &getAnalysis<TargetPassConfig>();
518 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
519 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
521 WindowScheduler WS(&Context, L);
522 return WS.run();
523}
524
525bool MachinePipeliner::useSwingModuloScheduler() {
526 // SwingModuloScheduler does not work when WindowScheduler is forced.
528}
529
530bool MachinePipeliner::useWindowScheduler(bool Changed) {
531 // WindowScheduler does not work for following cases:
532 // 1. when it is off.
533 // 2. when SwingModuloScheduler is successfully scheduled.
534 // 3. when pragma II is enabled.
535 if (II_setByPragma) {
536 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
537 "llvm.loop.pipeline.initiationinterval is set.\n");
538 return false;
539 }
540
543}
544
545void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
546 if (SwpForceII > 0)
547 MII = SwpForceII;
548 else if (II_setByPragma > 0)
549 MII = II_setByPragma;
550 else
551 MII = std::max(ResMII, RecMII);
552}
553
554void SwingSchedulerDAG::setMAX_II() {
555 if (SwpForceII > 0)
556 MAX_II = SwpForceII;
557 else if (II_setByPragma > 0)
558 MAX_II = II_setByPragma;
559 else
560 MAX_II = MII + SwpIISearchRange;
561}
562
563/// We override the schedule function in ScheduleDAGInstrs to implement the
564/// scheduling part of the Swing Modulo Scheduling algorithm.
566 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
567 buildSchedGraph(AA);
568 addLoopCarriedDependences(AA);
569 updatePhiDependences();
571 changeDependences();
572 postProcessDAG();
573 LLVM_DEBUG(dump());
574
575 NodeSetType NodeSets;
576 findCircuits(NodeSets);
577 NodeSetType Circuits = NodeSets;
578
579 // Calculate the MII.
580 unsigned ResMII = calculateResMII();
581 unsigned RecMII = calculateRecMII(NodeSets);
582
583 fuseRecs(NodeSets);
584
585 // This flag is used for testing and can cause correctness problems.
586 if (SwpIgnoreRecMII)
587 RecMII = 0;
588
589 setMII(ResMII, RecMII);
590 setMAX_II();
591
592 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
593 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
594
595 // Can't schedule a loop without a valid MII.
596 if (MII == 0) {
597 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
598 NumFailZeroMII++;
599 Pass.ORE->emit([&]() {
601 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
602 << "Invalid Minimal Initiation Interval: 0";
603 });
604 return;
605 }
606
607 // Don't pipeline large loops.
608 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
609 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
610 << ", we don't pipeline large loops\n");
611 NumFailLargeMaxMII++;
612 Pass.ORE->emit([&]() {
614 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
615 << "Minimal Initiation Interval too large: "
616 << ore::NV("MII", (int)MII) << " > "
617 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
618 << "Refer to -pipeliner-max-mii.";
619 });
620 return;
621 }
622
623 computeNodeFunctions(NodeSets);
624
625 registerPressureFilter(NodeSets);
626
627 colocateNodeSets(NodeSets);
628
629 checkNodeSets(NodeSets);
630
631 LLVM_DEBUG({
632 for (auto &I : NodeSets) {
633 dbgs() << " Rec NodeSet ";
634 I.dump();
635 }
636 });
637
638 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
639
640 groupRemainingNodes(NodeSets);
641
642 removeDuplicateNodes(NodeSets);
643
644 LLVM_DEBUG({
645 for (auto &I : NodeSets) {
646 dbgs() << " NodeSet ";
647 I.dump();
648 }
649 });
650
651 computeNodeOrder(NodeSets);
652
653 // check for node order issues
654 checkValidNodeOrder(Circuits);
655
656 SMSchedule Schedule(Pass.MF, this);
657 Scheduled = schedulePipeline(Schedule);
658
659 if (!Scheduled){
660 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
661 NumFailNoSchedule++;
662 Pass.ORE->emit([&]() {
664 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
665 << "Unable to find schedule";
666 });
667 return;
668 }
669
670 unsigned numStages = Schedule.getMaxStageCount();
671 // No need to generate pipeline if there are no overlapped iterations.
672 if (numStages == 0) {
673 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
674 NumFailZeroStage++;
675 Pass.ORE->emit([&]() {
677 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
678 << "No need to pipeline - no overlapped iterations in schedule.";
679 });
680 return;
681 }
682 // Check that the maximum stage count is less than user-defined limit.
683 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
684 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
685 << " : too many stages, abort\n");
686 NumFailLargeMaxStage++;
687 Pass.ORE->emit([&]() {
689 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
690 << "Too many stages in schedule: "
691 << ore::NV("numStages", (int)numStages) << " > "
692 << ore::NV("SwpMaxStages", SwpMaxStages)
693 << ". Refer to -pipeliner-max-stages.";
694 });
695 return;
696 }
697
698 Pass.ORE->emit([&]() {
700 Loop.getHeader())
701 << "Pipelined succesfully!";
702 });
703
704 // Generate the schedule as a ModuloSchedule.
705 DenseMap<MachineInstr *, int> Cycles, Stages;
706 std::vector<MachineInstr *> OrderedInsts;
707 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
708 ++Cycle) {
709 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
710 OrderedInsts.push_back(SU->getInstr());
711 Cycles[SU->getInstr()] = Cycle;
712 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
713 }
714 }
716 for (auto &KV : NewMIs) {
717 Cycles[KV.first] = Cycles[KV.second];
718 Stages[KV.first] = Stages[KV.second];
719 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
720 }
721
722 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
723 std::move(Stages));
725 assert(NewInstrChanges.empty() &&
726 "Cannot serialize a schedule with InstrChanges!");
728 MSTI.annotate();
729 return;
730 }
731 // The experimental code generator can't work if there are InstChanges.
732 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
733 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
734 MSE.expand();
735 } else if (MVECodeGen && NewInstrChanges.empty() &&
736 LoopPipelinerInfo->isMVEExpanderSupported() &&
738 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
739 MSE.expand();
740 } else {
741 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
742 MSE.expand();
743 MSE.cleanup();
744 }
745 ++NumPipelined;
746}
747
748/// Clean up after the software pipeliner runs.
750 for (auto &KV : NewMIs)
751 MF.deleteMachineInstr(KV.second);
752 NewMIs.clear();
753
754 // Call the superclass.
756}
757
758/// Return the register values for the operands of a Phi instruction.
759/// This function assume the instruction is a Phi.
761 unsigned &InitVal, unsigned &LoopVal) {
762 assert(Phi.isPHI() && "Expecting a Phi.");
763
764 InitVal = 0;
765 LoopVal = 0;
766 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
767 if (Phi.getOperand(i + 1).getMBB() != Loop)
768 InitVal = Phi.getOperand(i).getReg();
769 else
770 LoopVal = Phi.getOperand(i).getReg();
771
772 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
773}
774
775/// Return the Phi register value that comes the loop block.
776static unsigned getLoopPhiReg(const MachineInstr &Phi,
777 const MachineBasicBlock *LoopBB) {
778 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
779 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
780 return Phi.getOperand(i).getReg();
781 return 0;
782}
783
784/// Return true if SUb can be reached from SUa following the chain edges.
785static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
788 Worklist.push_back(SUa);
789 while (!Worklist.empty()) {
790 const SUnit *SU = Worklist.pop_back_val();
791 for (const auto &SI : SU->Succs) {
792 SUnit *SuccSU = SI.getSUnit();
793 if (SI.getKind() == SDep::Order) {
794 if (Visited.count(SuccSU))
795 continue;
796 if (SuccSU == SUb)
797 return true;
798 Worklist.push_back(SuccSU);
799 Visited.insert(SuccSU);
800 }
801 }
802 }
803 return false;
804}
805
806/// Return true if the instruction causes a chain between memory
807/// references before and after it.
809 return MI.isCall() || MI.mayRaiseFPException() ||
810 MI.hasUnmodeledSideEffects() ||
811 (MI.hasOrderedMemoryRef() &&
812 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
813}
814
815/// Return the underlying objects for the memory references of an instruction.
816/// This function calls the code in ValueTracking, but first checks that the
817/// instruction has a memory operand.
820 if (!MI->hasOneMemOperand())
821 return;
822 MachineMemOperand *MM = *MI->memoperands_begin();
823 if (!MM->getValue())
824 return;
825 getUnderlyingObjects(MM->getValue(), Objs);
826 for (const Value *V : Objs) {
827 if (!isIdentifiedObject(V)) {
828 Objs.clear();
829 return;
830 }
831 }
832}
833
834/// Add a chain edge between a load and store if the store can be an
835/// alias of the load on a subsequent iteration, i.e., a loop carried
836/// dependence. This code is very similar to the code in ScheduleDAGInstrs
837/// but that code doesn't create loop carried dependences.
838void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
842 for (auto &SU : SUnits) {
843 MachineInstr &MI = *SU.getInstr();
845 PendingLoads.clear();
846 else if (MI.mayLoad()) {
849 if (Objs.empty())
851 for (const auto *V : Objs) {
852 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
853 SUs.push_back(&SU);
854 }
855 } else if (MI.mayStore()) {
858 if (Objs.empty())
860 for (const auto *V : Objs) {
862 PendingLoads.find(V);
863 if (I == PendingLoads.end())
864 continue;
865 for (auto *Load : I->second) {
866 if (isSuccOrder(Load, &SU))
867 continue;
868 MachineInstr &LdMI = *Load->getInstr();
869 // First, perform the cheaper check that compares the base register.
870 // If they are the same and the load offset is less than the store
871 // offset, then mark the dependence as loop carried potentially.
872 const MachineOperand *BaseOp1, *BaseOp2;
873 int64_t Offset1, Offset2;
874 bool Offset1IsScalable, Offset2IsScalable;
875 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
876 Offset1IsScalable, TRI) &&
877 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
878 Offset2IsScalable, TRI)) {
879 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
880 Offset1IsScalable == Offset2IsScalable &&
881 (int)Offset1 < (int)Offset2) {
883 "What happened to the chain edge?");
884 SDep Dep(Load, SDep::Barrier);
885 Dep.setLatency(1);
886 SU.addPred(Dep);
887 continue;
888 }
889 }
890 // Second, the more expensive check that uses alias analysis on the
891 // base registers. If they alias, and the load offset is less than
892 // the store offset, the mark the dependence as loop carried.
893 if (!AA) {
894 SDep Dep(Load, SDep::Barrier);
895 Dep.setLatency(1);
896 SU.addPred(Dep);
897 continue;
898 }
899 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
900 MachineMemOperand *MMO2 = *MI.memoperands_begin();
901 if (!MMO1->getValue() || !MMO2->getValue()) {
902 SDep Dep(Load, SDep::Barrier);
903 Dep.setLatency(1);
904 SU.addPred(Dep);
905 continue;
906 }
907 if (MMO1->getValue() == MMO2->getValue() &&
908 MMO1->getOffset() <= MMO2->getOffset()) {
909 SDep Dep(Load, SDep::Barrier);
910 Dep.setLatency(1);
911 SU.addPred(Dep);
912 continue;
913 }
914 if (!AA->isNoAlias(
917 MMO2->getAAInfo()))) {
918 SDep Dep(Load, SDep::Barrier);
919 Dep.setLatency(1);
920 SU.addPred(Dep);
921 }
922 }
923 }
924 }
925 }
926}
927
928/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
929/// processes dependences for PHIs. This function adds true dependences
930/// from a PHI to a use, and a loop carried dependence from the use to the
931/// PHI. The loop carried dependence is represented as an anti dependence
932/// edge. This function also removes chain dependences between unrelated
933/// PHIs.
934void SwingSchedulerDAG::updatePhiDependences() {
935 SmallVector<SDep, 4> RemoveDeps;
937
938 // Iterate over each DAG node.
939 for (SUnit &I : SUnits) {
940 RemoveDeps.clear();
941 // Set to true if the instruction has an operand defined by a Phi.
942 unsigned HasPhiUse = 0;
943 unsigned HasPhiDef = 0;
944 MachineInstr *MI = I.getInstr();
945 // Iterate over each operand, and we process the definitions.
946 for (const MachineOperand &MO : MI->operands()) {
947 if (!MO.isReg())
948 continue;
949 Register Reg = MO.getReg();
950 if (MO.isDef()) {
951 // If the register is used by a Phi, then create an anti dependence.
953 UI = MRI.use_instr_begin(Reg),
954 UE = MRI.use_instr_end();
955 UI != UE; ++UI) {
956 MachineInstr *UseMI = &*UI;
957 SUnit *SU = getSUnit(UseMI);
958 if (SU != nullptr && UseMI->isPHI()) {
959 if (!MI->isPHI()) {
960 SDep Dep(SU, SDep::Anti, Reg);
961 Dep.setLatency(1);
962 I.addPred(Dep);
963 } else {
964 HasPhiDef = Reg;
965 // Add a chain edge to a dependent Phi that isn't an existing
966 // predecessor.
967 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
968 I.addPred(SDep(SU, SDep::Barrier));
969 }
970 }
971 }
972 } else if (MO.isUse()) {
973 // If the register is defined by a Phi, then create a true dependence.
975 if (DefMI == nullptr)
976 continue;
977 SUnit *SU = getSUnit(DefMI);
978 if (SU != nullptr && DefMI->isPHI()) {
979 if (!MI->isPHI()) {
980 SDep Dep(SU, SDep::Data, Reg);
981 Dep.setLatency(0);
982 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
983 &SchedModel);
984 I.addPred(Dep);
985 } else {
986 HasPhiUse = Reg;
987 // Add a chain edge to a dependent Phi that isn't an existing
988 // predecessor.
989 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
990 I.addPred(SDep(SU, SDep::Barrier));
991 }
992 }
993 }
994 }
995 // Remove order dependences from an unrelated Phi.
996 if (!SwpPruneDeps)
997 continue;
998 for (auto &PI : I.Preds) {
999 MachineInstr *PMI = PI.getSUnit()->getInstr();
1000 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1001 if (I.getInstr()->isPHI()) {
1002 if (PMI->getOperand(0).getReg() == HasPhiUse)
1003 continue;
1004 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1005 continue;
1006 }
1007 RemoveDeps.push_back(PI);
1008 }
1009 }
1010 for (const SDep &D : RemoveDeps)
1011 I.removePred(D);
1012 }
1013}
1014
1015/// Iterate over each DAG node and see if we can change any dependences
1016/// in order to reduce the recurrence MII.
1017void SwingSchedulerDAG::changeDependences() {
1018 // See if an instruction can use a value from the previous iteration.
1019 // If so, we update the base and offset of the instruction and change
1020 // the dependences.
1021 for (SUnit &I : SUnits) {
1022 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1023 int64_t NewOffset = 0;
1024 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1025 NewOffset))
1026 continue;
1027
1028 // Get the MI and SUnit for the instruction that defines the original base.
1029 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1031 if (!DefMI)
1032 continue;
1033 SUnit *DefSU = getSUnit(DefMI);
1034 if (!DefSU)
1035 continue;
1036 // Get the MI and SUnit for the instruction that defins the new base.
1037 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1038 if (!LastMI)
1039 continue;
1040 SUnit *LastSU = getSUnit(LastMI);
1041 if (!LastSU)
1042 continue;
1043
1044 if (Topo.IsReachable(&I, LastSU))
1045 continue;
1046
1047 // Remove the dependence. The value now depends on a prior iteration.
1049 for (const SDep &P : I.Preds)
1050 if (P.getSUnit() == DefSU)
1051 Deps.push_back(P);
1052 for (const SDep &D : Deps) {
1053 Topo.RemovePred(&I, D.getSUnit());
1054 I.removePred(D);
1055 }
1056 // Remove the chain dependence between the instructions.
1057 Deps.clear();
1058 for (auto &P : LastSU->Preds)
1059 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1060 Deps.push_back(P);
1061 for (const SDep &D : Deps) {
1062 Topo.RemovePred(LastSU, D.getSUnit());
1063 LastSU->removePred(D);
1064 }
1065
1066 // Add a dependence between the new instruction and the instruction
1067 // that defines the new base.
1068 SDep Dep(&I, SDep::Anti, NewBase);
1069 Topo.AddPred(LastSU, &I);
1070 LastSU->addPred(Dep);
1071
1072 // Remember the base and offset information so that we can update the
1073 // instruction during code generation.
1074 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1075 }
1076}
1077
1078/// Create an instruction stream that represents a single iteration and stage of
1079/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1080/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1081/// function is an approximation of SMSchedule::finalizeSchedule with all
1082/// non-const operations removed.
1084 SMSchedule &Schedule,
1085 std::vector<MachineInstr *> &OrderedInsts,
1088
1089 // Move all instructions to the first stage from the later stages.
1090 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1091 ++Cycle) {
1092 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1093 Stage <= LastStage; ++Stage) {
1094 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1095 Cycle + Stage * Schedule.getInitiationInterval()))) {
1096 Instrs[Cycle].push_front(SU);
1097 }
1098 }
1099 }
1100
1101 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1102 ++Cycle) {
1103 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1104 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1105 for (SUnit *SU : CycleInstrs) {
1106 MachineInstr *MI = SU->getInstr();
1107 OrderedInsts.push_back(MI);
1108 Stages[MI] = Schedule.stageScheduled(SU);
1109 }
1110 }
1111}
1112
1113namespace {
1114
1115// FuncUnitSorter - Comparison operator used to sort instructions by
1116// the number of functional unit choices.
1117struct FuncUnitSorter {
1118 const InstrItineraryData *InstrItins;
1119 const MCSubtargetInfo *STI;
1121
1122 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1123 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1124
1125 // Compute the number of functional unit alternatives needed
1126 // at each stage, and take the minimum value. We prioritize the
1127 // instructions by the least number of choices first.
1128 unsigned minFuncUnits(const MachineInstr *Inst,
1129 InstrStage::FuncUnits &F) const {
1130 unsigned SchedClass = Inst->getDesc().getSchedClass();
1131 unsigned min = UINT_MAX;
1132 if (InstrItins && !InstrItins->isEmpty()) {
1133 for (const InstrStage &IS :
1134 make_range(InstrItins->beginStage(SchedClass),
1135 InstrItins->endStage(SchedClass))) {
1136 InstrStage::FuncUnits funcUnits = IS.getUnits();
1137 unsigned numAlternatives = llvm::popcount(funcUnits);
1138 if (numAlternatives < min) {
1139 min = numAlternatives;
1140 F = funcUnits;
1141 }
1142 }
1143 return min;
1144 }
1145 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1146 const MCSchedClassDesc *SCDesc =
1147 STI->getSchedModel().getSchedClassDesc(SchedClass);
1148 if (!SCDesc->isValid())
1149 // No valid Schedule Class Desc for schedClass, should be
1150 // Pseudo/PostRAPseudo
1151 return min;
1152
1153 for (const MCWriteProcResEntry &PRE :
1154 make_range(STI->getWriteProcResBegin(SCDesc),
1155 STI->getWriteProcResEnd(SCDesc))) {
1156 if (!PRE.ReleaseAtCycle)
1157 continue;
1158 const MCProcResourceDesc *ProcResource =
1159 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1160 unsigned NumUnits = ProcResource->NumUnits;
1161 if (NumUnits < min) {
1162 min = NumUnits;
1163 F = PRE.ProcResourceIdx;
1164 }
1165 }
1166 return min;
1167 }
1168 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1169 }
1170
1171 // Compute the critical resources needed by the instruction. This
1172 // function records the functional units needed by instructions that
1173 // must use only one functional unit. We use this as a tie breaker
1174 // for computing the resource MII. The instrutions that require
1175 // the same, highly used, functional unit have high priority.
1176 void calcCriticalResources(MachineInstr &MI) {
1177 unsigned SchedClass = MI.getDesc().getSchedClass();
1178 if (InstrItins && !InstrItins->isEmpty()) {
1179 for (const InstrStage &IS :
1180 make_range(InstrItins->beginStage(SchedClass),
1181 InstrItins->endStage(SchedClass))) {
1182 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1183 if (llvm::popcount(FuncUnits) == 1)
1184 Resources[FuncUnits]++;
1185 }
1186 return;
1187 }
1188 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1189 const MCSchedClassDesc *SCDesc =
1190 STI->getSchedModel().getSchedClassDesc(SchedClass);
1191 if (!SCDesc->isValid())
1192 // No valid Schedule Class Desc for schedClass, should be
1193 // Pseudo/PostRAPseudo
1194 return;
1195
1196 for (const MCWriteProcResEntry &PRE :
1197 make_range(STI->getWriteProcResBegin(SCDesc),
1198 STI->getWriteProcResEnd(SCDesc))) {
1199 if (!PRE.ReleaseAtCycle)
1200 continue;
1201 Resources[PRE.ProcResourceIdx]++;
1202 }
1203 return;
1204 }
1205 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1206 }
1207
1208 /// Return true if IS1 has less priority than IS2.
1209 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1210 InstrStage::FuncUnits F1 = 0, F2 = 0;
1211 unsigned MFUs1 = minFuncUnits(IS1, F1);
1212 unsigned MFUs2 = minFuncUnits(IS2, F2);
1213 if (MFUs1 == MFUs2)
1214 return Resources.lookup(F1) < Resources.lookup(F2);
1215 return MFUs1 > MFUs2;
1216 }
1217};
1218
1219/// Calculate the maximum register pressure of the scheduled instructions stream
1220class HighRegisterPressureDetector {
1221 MachineBasicBlock *OrigMBB;
1222 const MachineFunction &MF;
1223 const MachineRegisterInfo &MRI;
1224 const TargetRegisterInfo *TRI;
1225
1226 const unsigned PSetNum;
1227
1228 // Indexed by PSet ID
1229 // InitSetPressure takes into account the register pressure of live-in
1230 // registers. It's not depend on how the loop is scheduled, so it's enough to
1231 // calculate them once at the beginning.
1232 std::vector<unsigned> InitSetPressure;
1233
1234 // Indexed by PSet ID
1235 // Upper limit for each register pressure set
1236 std::vector<unsigned> PressureSetLimit;
1237
1239
1241
1242public:
1243 using OrderedInstsTy = std::vector<MachineInstr *>;
1244 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1245
1246private:
1247 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1248 if (Pressures.size() == 0) {
1249 dbgs() << "[]";
1250 } else {
1251 char Prefix = '[';
1252 for (unsigned P : Pressures) {
1253 dbgs() << Prefix << P;
1254 Prefix = ' ';
1255 }
1256 dbgs() << ']';
1257 }
1258 }
1259
1260 void dumpPSet(Register Reg) const {
1261 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1262 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1263 ++PSetIter) {
1264 dbgs() << *PSetIter << ' ';
1265 }
1266 dbgs() << '\n';
1267 }
1268
1269 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1270 Register Reg) const {
1271 auto PSetIter = MRI.getPressureSets(Reg);
1272 unsigned Weight = PSetIter.getWeight();
1273 for (; PSetIter.isValid(); ++PSetIter)
1274 Pressure[*PSetIter] += Weight;
1275 }
1276
1277 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1278 Register Reg) const {
1279 auto PSetIter = MRI.getPressureSets(Reg);
1280 unsigned Weight = PSetIter.getWeight();
1281 for (; PSetIter.isValid(); ++PSetIter) {
1282 auto &P = Pressure[*PSetIter];
1283 assert(P >= Weight &&
1284 "register pressure must be greater than or equal weight");
1285 P -= Weight;
1286 }
1287 }
1288
1289 // Return true if Reg is fixed one, for example, stack pointer
1290 bool isFixedRegister(Register Reg) const {
1291 return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
1292 }
1293
1294 bool isDefinedInThisLoop(Register Reg) const {
1295 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1296 }
1297
1298 // Search for live-in variables. They are factored into the register pressure
1299 // from the begining. Live-in variables used by every iteration should be
1300 // considered as alive throughout the loop. For example, the variable `c` in
1301 // following code. \code
1302 // int c = ...;
1303 // for (int i = 0; i < n; i++)
1304 // a[i] += b[i] + c;
1305 // \endcode
1306 void computeLiveIn() {
1308 for (auto &MI : *OrigMBB) {
1309 if (MI.isDebugInstr())
1310 continue;
1311 for (auto &Use : ROMap[&MI].Uses) {
1312 auto Reg = Use.RegUnit;
1313 // Ignore the variable that appears only on one side of phi instruction
1314 // because it's used only at the first iteration.
1315 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1316 continue;
1317 if (isFixedRegister(Reg))
1318 continue;
1319 if (isDefinedInThisLoop(Reg))
1320 continue;
1321 Used.insert(Reg);
1322 }
1323 }
1324
1325 for (auto LiveIn : Used)
1326 increaseRegisterPressure(InitSetPressure, LiveIn);
1327 }
1328
1329 // Calculate the upper limit of each pressure set
1330 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1331 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1332 PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
1333
1334 // We assume fixed registers, such as stack pointer, are already in use.
1335 // Therefore subtracting the weight of the fixed registers from the limit of
1336 // each pressure set in advance.
1338 for (const TargetRegisterClass *TRC : TRI->regclasses()) {
1339 for (const MCPhysReg Reg : *TRC)
1340 if (isFixedRegister(Reg))
1341 FixedRegs.insert(Reg);
1342 }
1343
1344 LLVM_DEBUG({
1345 for (auto Reg : FixedRegs) {
1346 dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
1347 const int *Sets = TRI->getRegUnitPressureSets(Reg);
1348 for (; *Sets != -1; Sets++) {
1349 dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
1350 }
1351 dbgs() << "]\n";
1352 }
1353 });
1354
1355 for (auto Reg : FixedRegs) {
1356 LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
1357 << "\n");
1358 auto PSetIter = MRI.getPressureSets(Reg);
1359 unsigned Weight = PSetIter.getWeight();
1360 for (; PSetIter.isValid(); ++PSetIter) {
1361 unsigned &Limit = PressureSetLimit[*PSetIter];
1362 assert(Limit >= Weight &&
1363 "register pressure limit must be greater than or equal weight");
1364 Limit -= Weight;
1365 LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
1366 << " (decreased by " << Weight << ")\n");
1367 }
1368 }
1369 }
1370
1371 // There are two patterns of last-use.
1372 // - by an instruction of the current iteration
1373 // - by a phi instruction of the next iteration (loop carried value)
1374 //
1375 // Furthermore, following two groups of instructions are executed
1376 // simultaneously
1377 // - next iteration's phi instructions in i-th stage
1378 // - current iteration's instructions in i+1-th stage
1379 //
1380 // This function calculates the last-use of each register while taking into
1381 // account the above two patterns.
1382 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1383 Instr2StageTy &Stages) const {
1384 // We treat virtual registers that are defined and used in this loop.
1385 // Following virtual register will be ignored
1386 // - live-in one
1387 // - defined but not used in the loop (potentially live-out)
1388 DenseSet<Register> TargetRegs;
1389 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1390 if (isDefinedInThisLoop(Reg))
1391 TargetRegs.insert(Reg);
1392 };
1393 for (MachineInstr *MI : OrderedInsts) {
1394 if (MI->isPHI()) {
1395 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1396 UpdateTargetRegs(Reg);
1397 } else {
1398 for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1399 UpdateTargetRegs(Use.RegUnit);
1400 }
1401 }
1402
1403 const auto InstrScore = [&Stages](MachineInstr *MI) {
1404 return Stages[MI] + MI->isPHI();
1405 };
1406
1408 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1409 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1410 auto Reg = Use.RegUnit;
1411 if (!TargetRegs.contains(Reg))
1412 continue;
1413 auto Ite = LastUseMI.find(Reg);
1414 if (Ite == LastUseMI.end()) {
1415 LastUseMI[Reg] = MI;
1416 } else {
1417 MachineInstr *Orig = Ite->second;
1418 MachineInstr *New = MI;
1419 if (InstrScore(Orig) < InstrScore(New))
1420 LastUseMI[Reg] = New;
1421 }
1422 }
1423 }
1424
1425 Instr2LastUsesTy LastUses;
1426 for (auto &Entry : LastUseMI)
1427 LastUses[Entry.second].insert(Entry.first);
1428 return LastUses;
1429 }
1430
1431 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1432 // iterations and check the register pressure at the point where all stages
1433 // overlapping.
1434 //
1435 // An example of unrolled loop where #Stage is 4..
1436 // Iter i+0 i+1 i+2 i+3
1437 // ------------------------
1438 // Stage 0
1439 // Stage 1 0
1440 // Stage 2 1 0
1441 // Stage 3 2 1 0 <- All stages overlap
1442 //
1443 std::vector<unsigned>
1444 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1445 Instr2StageTy &Stages,
1446 const unsigned StageCount) const {
1447 using RegSetTy = SmallDenseSet<Register, 16>;
1448
1449 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1450 // manage the liveness of the registers independently by iterations.
1451 SmallVector<RegSetTy> LiveRegSets(StageCount);
1452
1453 auto CurSetPressure = InitSetPressure;
1454 auto MaxSetPressure = InitSetPressure;
1455 auto LastUses = computeLastUses(OrderedInsts, Stages);
1456
1457 LLVM_DEBUG({
1458 dbgs() << "Ordered instructions:\n";
1459 for (MachineInstr *MI : OrderedInsts) {
1460 dbgs() << "Stage " << Stages[MI] << ": ";
1461 MI->dump();
1462 }
1463 });
1464
1465 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1466 Register Reg) {
1467 if (!Reg.isValid() || isFixedRegister(Reg))
1468 return;
1469
1470 bool Inserted = RegSet.insert(Reg).second;
1471 if (!Inserted)
1472 return;
1473
1474 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1475 increaseRegisterPressure(CurSetPressure, Reg);
1476 LLVM_DEBUG(dumpPSet(Reg));
1477 };
1478
1479 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1480 Register Reg) {
1481 if (!Reg.isValid() || isFixedRegister(Reg))
1482 return;
1483
1484 // live-in register
1485 if (!RegSet.contains(Reg))
1486 return;
1487
1488 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1489 RegSet.erase(Reg);
1490 decreaseRegisterPressure(CurSetPressure, Reg);
1491 LLVM_DEBUG(dumpPSet(Reg));
1492 };
1493
1494 for (unsigned I = 0; I < StageCount; I++) {
1495 for (MachineInstr *MI : OrderedInsts) {
1496 const auto Stage = Stages[MI];
1497 if (I < Stage)
1498 continue;
1499
1500 const unsigned Iter = I - Stage;
1501
1502 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1503 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1504
1505 for (auto LastUse : LastUses[MI]) {
1506 if (MI->isPHI()) {
1507 if (Iter != 0)
1508 EraseReg(LiveRegSets[Iter - 1], LastUse);
1509 } else {
1510 EraseReg(LiveRegSets[Iter], LastUse);
1511 }
1512 }
1513
1514 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1515 MaxSetPressure[PSet] =
1516 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1517
1518 LLVM_DEBUG({
1519 dbgs() << "CurSetPressure=";
1520 dumpRegisterPressures(CurSetPressure);
1521 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1522 MI->dump();
1523 });
1524 }
1525 }
1526
1527 return MaxSetPressure;
1528 }
1529
1530public:
1531 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1532 const MachineFunction &MF)
1533 : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
1534 TRI(MF.getSubtarget().getRegisterInfo()),
1535 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1536 PressureSetLimit(PSetNum, 0) {}
1537
1538 // Used to calculate register pressure, which is independent of loop
1539 // scheduling.
1540 void init(const RegisterClassInfo &RCI) {
1541 for (MachineInstr &MI : *OrigMBB) {
1542 if (MI.isDebugInstr())
1543 continue;
1544 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1545 }
1546
1547 computeLiveIn();
1548 computePressureSetLimit(RCI);
1549 }
1550
1551 // Calculate the maximum register pressures of the loop and check if they
1552 // exceed the limit
1553 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1554 const unsigned MaxStage) const {
1556 "the percentage of the margin must be between 0 to 100");
1557
1558 OrderedInstsTy OrderedInsts;
1559 Instr2StageTy Stages;
1560 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1561 const auto MaxSetPressure =
1562 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1563
1564 LLVM_DEBUG({
1565 dbgs() << "Dump MaxSetPressure:\n";
1566 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1567 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1568 }
1569 dbgs() << '\n';
1570 });
1571
1572 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1573 unsigned Limit = PressureSetLimit[PSet];
1574 unsigned Margin = Limit * RegPressureMargin / 100;
1575 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1576 << " Margin=" << Margin << "\n");
1577 if (Limit < MaxSetPressure[PSet] + Margin) {
1578 LLVM_DEBUG(
1579 dbgs()
1580 << "Rejected the schedule because of too high register pressure\n");
1581 return true;
1582 }
1583 }
1584 return false;
1585 }
1586};
1587
1588} // end anonymous namespace
1589
1590/// Calculate the resource constrained minimum initiation interval for the
1591/// specified loop. We use the DFA to model the resources needed for
1592/// each instruction, and we ignore dependences. A different DFA is created
1593/// for each cycle that is required. When adding a new instruction, we attempt
1594/// to add it to each existing DFA, until a legal space is found. If the
1595/// instruction cannot be reserved in an existing DFA, we create a new one.
1596unsigned SwingSchedulerDAG::calculateResMII() {
1597 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1599 return RM.calculateResMII();
1600}
1601
1602/// Calculate the recurrence-constrainted minimum initiation interval.
1603/// Iterate over each circuit. Compute the delay(c) and distance(c)
1604/// for each circuit. The II needs to satisfy the inequality
1605/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1606/// II that satisfies the inequality, and the RecMII is the maximum
1607/// of those values.
1608unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1609 unsigned RecMII = 0;
1610
1611 for (NodeSet &Nodes : NodeSets) {
1612 if (Nodes.empty())
1613 continue;
1614
1615 unsigned Delay = Nodes.getLatency();
1616 unsigned Distance = 1;
1617
1618 // ii = ceil(delay / distance)
1619 unsigned CurMII = (Delay + Distance - 1) / Distance;
1620 Nodes.setRecMII(CurMII);
1621 if (CurMII > RecMII)
1622 RecMII = CurMII;
1623 }
1624
1625 return RecMII;
1626}
1627
1628/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1629/// but we do this to find the circuits, and then change them back.
1630static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1632 for (SUnit &SU : SUnits) {
1633 for (SDep &Pred : SU.Preds)
1634 if (Pred.getKind() == SDep::Anti)
1635 DepsAdded.push_back(std::make_pair(&SU, Pred));
1636 }
1637 for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1638 // Remove this anti dependency and add one in the reverse direction.
1639 SUnit *SU = P.first;
1640 SDep &D = P.second;
1641 SUnit *TargetSU = D.getSUnit();
1642 unsigned Reg = D.getReg();
1643 unsigned Lat = D.getLatency();
1644 SU->removePred(D);
1645 SDep Dep(SU, SDep::Anti, Reg);
1646 Dep.setLatency(Lat);
1647 TargetSU->addPred(Dep);
1648 }
1649}
1650
1651/// Create the adjacency structure of the nodes in the graph.
1652void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1653 SwingSchedulerDAG *DAG) {
1654 BitVector Added(SUnits.size());
1655 DenseMap<int, int> OutputDeps;
1656 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1657 Added.reset();
1658 // Add any successor to the adjacency matrix and exclude duplicates.
1659 for (auto &SI : SUnits[i].Succs) {
1660 // Only create a back-edge on the first and last nodes of a dependence
1661 // chain. This records any chains and adds them later.
1662 if (SI.getKind() == SDep::Output) {
1663 int N = SI.getSUnit()->NodeNum;
1664 int BackEdge = i;
1665 auto Dep = OutputDeps.find(BackEdge);
1666 if (Dep != OutputDeps.end()) {
1667 BackEdge = Dep->second;
1668 OutputDeps.erase(Dep);
1669 }
1670 OutputDeps[N] = BackEdge;
1671 }
1672 // Do not process a boundary node, an artificial node.
1673 // A back-edge is processed only if it goes to a Phi.
1674 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1675 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1676 continue;
1677 int N = SI.getSUnit()->NodeNum;
1678 if (!Added.test(N)) {
1679 AdjK[i].push_back(N);
1680 Added.set(N);
1681 }
1682 }
1683 // A chain edge between a store and a load is treated as a back-edge in the
1684 // adjacency matrix.
1685 for (auto &PI : SUnits[i].Preds) {
1686 if (!SUnits[i].getInstr()->mayStore() ||
1687 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1688 continue;
1689 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1690 int N = PI.getSUnit()->NodeNum;
1691 if (!Added.test(N)) {
1692 AdjK[i].push_back(N);
1693 Added.set(N);
1694 }
1695 }
1696 }
1697 }
1698 // Add back-edges in the adjacency matrix for the output dependences.
1699 for (auto &OD : OutputDeps)
1700 if (!Added.test(OD.second)) {
1701 AdjK[OD.first].push_back(OD.second);
1702 Added.set(OD.second);
1703 }
1704}
1705
1706/// Identify an elementary circuit in the dependence graph starting at the
1707/// specified node.
1708bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1709 bool HasBackedge) {
1710 SUnit *SV = &SUnits[V];
1711 bool F = false;
1712 Stack.insert(SV);
1713 Blocked.set(V);
1714
1715 for (auto W : AdjK[V]) {
1716 if (NumPaths > MaxPaths)
1717 break;
1718 if (W < S)
1719 continue;
1720 if (W == S) {
1721 if (!HasBackedge)
1722 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1723 F = true;
1724 ++NumPaths;
1725 break;
1726 } else if (!Blocked.test(W)) {
1727 if (circuit(W, S, NodeSets,
1728 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1729 F = true;
1730 }
1731 }
1732
1733 if (F)
1734 unblock(V);
1735 else {
1736 for (auto W : AdjK[V]) {
1737 if (W < S)
1738 continue;
1739 B[W].insert(SV);
1740 }
1741 }
1742 Stack.pop_back();
1743 return F;
1744}
1745
1746/// Unblock a node in the circuit finding algorithm.
1747void SwingSchedulerDAG::Circuits::unblock(int U) {
1748 Blocked.reset(U);
1750 while (!BU.empty()) {
1752 assert(SI != BU.end() && "Invalid B set.");
1753 SUnit *W = *SI;
1754 BU.erase(W);
1755 if (Blocked.test(W->NodeNum))
1756 unblock(W->NodeNum);
1757 }
1758}
1759
1760/// Identify all the elementary circuits in the dependence graph using
1761/// Johnson's circuit algorithm.
1762void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1763 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1764 // but we do this to find the circuits, and then change them back.
1766
1767 Circuits Cir(SUnits, Topo);
1768 // Create the adjacency structure.
1769 Cir.createAdjacencyStructure(this);
1770 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1771 Cir.reset();
1772 Cir.circuit(i, i, NodeSets);
1773 }
1774
1775 // Change the dependences back so that we've created a DAG again.
1777}
1778
1779// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1780// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1781// additional copies that are needed across iterations. An artificial dependence
1782// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1783
1784// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1785// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1786// PHI-------True-Dep------> USEOfPhi
1787
1788// The mutation creates
1789// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1790
1791// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1792// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1793// late to avoid additional copies across iterations. The possible scheduling
1794// order would be
1795// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1796
1797void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1798 for (SUnit &SU : DAG->SUnits) {
1799 // Find the COPY/REG_SEQUENCE instruction.
1800 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1801 continue;
1802
1803 // Record the loop carried PHIs.
1805 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1807
1808 for (auto &Dep : SU.Preds) {
1809 SUnit *TmpSU = Dep.getSUnit();
1810 MachineInstr *TmpMI = TmpSU->getInstr();
1811 SDep::Kind DepKind = Dep.getKind();
1812 // Save the loop carried PHI.
1813 if (DepKind == SDep::Anti && TmpMI->isPHI())
1814 PHISUs.push_back(TmpSU);
1815 // Save the source of COPY/REG_SEQUENCE.
1816 // If the source has no pre-decessors, we will end up creating cycles.
1817 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1818 SrcSUs.push_back(TmpSU);
1819 }
1820
1821 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1822 continue;
1823
1824 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1825 // SUnit to the container.
1827 // Do not use iterator based loop here as we are updating the container.
1828 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1829 for (auto &Dep : PHISUs[Index]->Succs) {
1830 if (Dep.getKind() != SDep::Data)
1831 continue;
1832
1833 SUnit *TmpSU = Dep.getSUnit();
1834 MachineInstr *TmpMI = TmpSU->getInstr();
1835 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1836 PHISUs.push_back(TmpSU);
1837 continue;
1838 }
1839 UseSUs.push_back(TmpSU);
1840 }
1841 }
1842
1843 if (UseSUs.size() == 0)
1844 continue;
1845
1846 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1847 // Add the artificial dependencies if it does not form a cycle.
1848 for (auto *I : UseSUs) {
1849 for (auto *Src : SrcSUs) {
1850 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1851 Src->addPred(SDep(I, SDep::Artificial));
1852 SDAG->Topo.AddPred(Src, I);
1853 }
1854 }
1855 }
1856 }
1857}
1858
1859/// Return true for DAG nodes that we ignore when computing the cost functions.
1860/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1861/// in the calculation of the ASAP, ALAP, etc functions.
1862static bool ignoreDependence(const SDep &D, bool isPred) {
1863 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1864 return true;
1865 return D.getKind() == SDep::Anti && isPred;
1866}
1867
1868/// Compute several functions need to order the nodes for scheduling.
1869/// ASAP - Earliest time to schedule a node.
1870/// ALAP - Latest time to schedule a node.
1871/// MOV - Mobility function, difference between ALAP and ASAP.
1872/// D - Depth of each node.
1873/// H - Height of each node.
1874void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1875 ScheduleInfo.resize(SUnits.size());
1876
1877 LLVM_DEBUG({
1878 for (int I : Topo) {
1879 const SUnit &SU = SUnits[I];
1880 dumpNode(SU);
1881 }
1882 });
1883
1884 int maxASAP = 0;
1885 // Compute ASAP and ZeroLatencyDepth.
1886 for (int I : Topo) {
1887 int asap = 0;
1888 int zeroLatencyDepth = 0;
1889 SUnit *SU = &SUnits[I];
1890 for (const SDep &P : SU->Preds) {
1891 SUnit *pred = P.getSUnit();
1892 if (P.getLatency() == 0)
1893 zeroLatencyDepth =
1894 std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1895 if (ignoreDependence(P, true))
1896 continue;
1897 asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1898 getDistance(pred, SU, P) * MII));
1899 }
1900 maxASAP = std::max(maxASAP, asap);
1901 ScheduleInfo[I].ASAP = asap;
1902 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1903 }
1904
1905 // Compute ALAP, ZeroLatencyHeight, and MOV.
1906 for (int I : llvm::reverse(Topo)) {
1907 int alap = maxASAP;
1908 int zeroLatencyHeight = 0;
1909 SUnit *SU = &SUnits[I];
1910 for (const SDep &S : SU->Succs) {
1911 SUnit *succ = S.getSUnit();
1912 if (succ->isBoundaryNode())
1913 continue;
1914 if (S.getLatency() == 0)
1915 zeroLatencyHeight =
1916 std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1917 if (ignoreDependence(S, true))
1918 continue;
1919 alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1920 getDistance(SU, succ, S) * MII));
1921 }
1922
1923 ScheduleInfo[I].ALAP = alap;
1924 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1925 }
1926
1927 // After computing the node functions, compute the summary for each node set.
1928 for (NodeSet &I : NodeSets)
1929 I.computeNodeSetInfo(this);
1930
1931 LLVM_DEBUG({
1932 for (unsigned i = 0; i < SUnits.size(); i++) {
1933 dbgs() << "\tNode " << i << ":\n";
1934 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1935 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1936 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1937 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1938 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1939 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1940 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1941 }
1942 });
1943}
1944
1945/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1946/// as the predecessors of the elements of NodeOrder that are not also in
1947/// NodeOrder.
1950 const NodeSet *S = nullptr) {
1951 Preds.clear();
1952 for (const SUnit *SU : NodeOrder) {
1953 for (const SDep &Pred : SU->Preds) {
1954 if (S && S->count(Pred.getSUnit()) == 0)
1955 continue;
1956 if (ignoreDependence(Pred, true))
1957 continue;
1958 if (NodeOrder.count(Pred.getSUnit()) == 0)
1959 Preds.insert(Pred.getSUnit());
1960 }
1961 // Back-edges are predecessors with an anti-dependence.
1962 for (const SDep &Succ : SU->Succs) {
1963 if (Succ.getKind() != SDep::Anti)
1964 continue;
1965 if (S && S->count(Succ.getSUnit()) == 0)
1966 continue;
1967 if (NodeOrder.count(Succ.getSUnit()) == 0)
1968 Preds.insert(Succ.getSUnit());
1969 }
1970 }
1971 return !Preds.empty();
1972}
1973
1974/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1975/// as the successors of the elements of NodeOrder that are not also in
1976/// NodeOrder.
1979 const NodeSet *S = nullptr) {
1980 Succs.clear();
1981 for (const SUnit *SU : NodeOrder) {
1982 for (const SDep &Succ : SU->Succs) {
1983 if (S && S->count(Succ.getSUnit()) == 0)
1984 continue;
1985 if (ignoreDependence(Succ, false))
1986 continue;
1987 if (NodeOrder.count(Succ.getSUnit()) == 0)
1988 Succs.insert(Succ.getSUnit());
1989 }
1990 for (const SDep &Pred : SU->Preds) {
1991 if (Pred.getKind() != SDep::Anti)
1992 continue;
1993 if (S && S->count(Pred.getSUnit()) == 0)
1994 continue;
1995 if (NodeOrder.count(Pred.getSUnit()) == 0)
1996 Succs.insert(Pred.getSUnit());
1997 }
1998 }
1999 return !Succs.empty();
2000}
2001
2002/// Return true if there is a path from the specified node to any of the nodes
2003/// in DestNodes. Keep track and return the nodes in any path.
2004static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2005 SetVector<SUnit *> &DestNodes,
2006 SetVector<SUnit *> &Exclude,
2007 SmallPtrSet<SUnit *, 8> &Visited) {
2008 if (Cur->isBoundaryNode())
2009 return false;
2010 if (Exclude.contains(Cur))
2011 return false;
2012 if (DestNodes.contains(Cur))
2013 return true;
2014 if (!Visited.insert(Cur).second)
2015 return Path.contains(Cur);
2016 bool FoundPath = false;
2017 for (auto &SI : Cur->Succs)
2018 if (!ignoreDependence(SI, false))
2019 FoundPath |=
2020 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
2021 for (auto &PI : Cur->Preds)
2022 if (PI.getKind() == SDep::Anti)
2023 FoundPath |=
2024 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
2025 if (FoundPath)
2026 Path.insert(Cur);
2027 return FoundPath;
2028}
2029
2030/// Compute the live-out registers for the instructions in a node-set.
2031/// The live-out registers are those that are defined in the node-set,
2032/// but not used. Except for use operands of Phis.
2034 NodeSet &NS) {
2039 for (SUnit *SU : NS) {
2040 const MachineInstr *MI = SU->getInstr();
2041 if (MI->isPHI())
2042 continue;
2043 for (const MachineOperand &MO : MI->all_uses()) {
2044 Register Reg = MO.getReg();
2045 if (Reg.isVirtual())
2046 Uses.insert(Reg);
2047 else if (MRI.isAllocatable(Reg))
2048 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2049 Uses.insert(Unit);
2050 }
2051 }
2052 for (SUnit *SU : NS)
2053 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2054 if (!MO.isDead()) {
2055 Register Reg = MO.getReg();
2056 if (Reg.isVirtual()) {
2057 if (!Uses.count(Reg))
2058 LiveOutRegs.push_back(RegisterMaskPair(Reg,
2060 } else if (MRI.isAllocatable(Reg)) {
2061 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2062 if (!Uses.count(Unit))
2063 LiveOutRegs.push_back(
2065 }
2066 }
2067 RPTracker.addLiveRegs(LiveOutRegs);
2068}
2069
2070/// A heuristic to filter nodes in recurrent node-sets if the register
2071/// pressure of a set is too high.
2072void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2073 for (auto &NS : NodeSets) {
2074 // Skip small node-sets since they won't cause register pressure problems.
2075 if (NS.size() <= 2)
2076 continue;
2077 IntervalPressure RecRegPressure;
2078 RegPressureTracker RecRPTracker(RecRegPressure);
2079 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2080 computeLiveOuts(MF, RecRPTracker, NS);
2081 RecRPTracker.closeBottom();
2082
2083 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2084 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2085 return A->NodeNum > B->NodeNum;
2086 });
2087
2088 for (auto &SU : SUnits) {
2089 // Since we're computing the register pressure for a subset of the
2090 // instructions in a block, we need to set the tracker for each
2091 // instruction in the node-set. The tracker is set to the instruction
2092 // just after the one we're interested in.
2094 RecRPTracker.setPos(std::next(CurInstI));
2095
2096 RegPressureDelta RPDelta;
2097 ArrayRef<PressureChange> CriticalPSets;
2098 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2099 CriticalPSets,
2100 RecRegPressure.MaxSetPressure);
2101 if (RPDelta.Excess.isValid()) {
2102 LLVM_DEBUG(
2103 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2105 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2106 NS.setExceedPressure(SU);
2107 break;
2108 }
2109 RecRPTracker.recede();
2110 }
2111 }
2112}
2113
2114/// A heuristic to colocate node sets that have the same set of
2115/// successors.
2116void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2117 unsigned Colocate = 0;
2118 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2119 NodeSet &N1 = NodeSets[i];
2121 if (N1.empty() || !succ_L(N1, S1))
2122 continue;
2123 for (int j = i + 1; j < e; ++j) {
2124 NodeSet &N2 = NodeSets[j];
2125 if (N1.compareRecMII(N2) != 0)
2126 continue;
2128 if (N2.empty() || !succ_L(N2, S2))
2129 continue;
2130 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2131 N1.setColocate(++Colocate);
2132 N2.setColocate(Colocate);
2133 break;
2134 }
2135 }
2136 }
2137}
2138
2139/// Check if the existing node-sets are profitable. If not, then ignore the
2140/// recurrent node-sets, and attempt to schedule all nodes together. This is
2141/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2142/// then it's best to try to schedule all instructions together instead of
2143/// starting with the recurrent node-sets.
2144void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2145 // Look for loops with a large MII.
2146 if (MII < 17)
2147 return;
2148 // Check if the node-set contains only a simple add recurrence.
2149 for (auto &NS : NodeSets) {
2150 if (NS.getRecMII() > 2)
2151 return;
2152 if (NS.getMaxDepth() > MII)
2153 return;
2154 }
2155 NodeSets.clear();
2156 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2157}
2158
2159/// Add the nodes that do not belong to a recurrence set into groups
2160/// based upon connected components.
2161void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2162 SetVector<SUnit *> NodesAdded;
2164 // Add the nodes that are on a path between the previous node sets and
2165 // the current node set.
2166 for (NodeSet &I : NodeSets) {
2168 // Add the nodes from the current node set to the previous node set.
2169 if (succ_L(I, N)) {
2171 for (SUnit *NI : N) {
2172 Visited.clear();
2173 computePath(NI, Path, NodesAdded, I, Visited);
2174 }
2175 if (!Path.empty())
2176 I.insert(Path.begin(), Path.end());
2177 }
2178 // Add the nodes from the previous node set to the current node set.
2179 N.clear();
2180 if (succ_L(NodesAdded, N)) {
2182 for (SUnit *NI : N) {
2183 Visited.clear();
2184 computePath(NI, Path, I, NodesAdded, Visited);
2185 }
2186 if (!Path.empty())
2187 I.insert(Path.begin(), Path.end());
2188 }
2189 NodesAdded.insert(I.begin(), I.end());
2190 }
2191
2192 // Create a new node set with the connected nodes of any successor of a node
2193 // in a recurrent set.
2194 NodeSet NewSet;
2196 if (succ_L(NodesAdded, N))
2197 for (SUnit *I : N)
2198 addConnectedNodes(I, NewSet, NodesAdded);
2199 if (!NewSet.empty())
2200 NodeSets.push_back(NewSet);
2201
2202 // Create a new node set with the connected nodes of any predecessor of a node
2203 // in a recurrent set.
2204 NewSet.clear();
2205 if (pred_L(NodesAdded, N))
2206 for (SUnit *I : N)
2207 addConnectedNodes(I, NewSet, NodesAdded);
2208 if (!NewSet.empty())
2209 NodeSets.push_back(NewSet);
2210
2211 // Create new nodes sets with the connected nodes any remaining node that
2212 // has no predecessor.
2213 for (SUnit &SU : SUnits) {
2214 if (NodesAdded.count(&SU) == 0) {
2215 NewSet.clear();
2216 addConnectedNodes(&SU, NewSet, NodesAdded);
2217 if (!NewSet.empty())
2218 NodeSets.push_back(NewSet);
2219 }
2220 }
2221}
2222
2223/// Add the node to the set, and add all of its connected nodes to the set.
2224void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2225 SetVector<SUnit *> &NodesAdded) {
2226 NewSet.insert(SU);
2227 NodesAdded.insert(SU);
2228 for (auto &SI : SU->Succs) {
2229 SUnit *Successor = SI.getSUnit();
2230 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2231 NodesAdded.count(Successor) == 0)
2232 addConnectedNodes(Successor, NewSet, NodesAdded);
2233 }
2234 for (auto &PI : SU->Preds) {
2235 SUnit *Predecessor = PI.getSUnit();
2236 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2237 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2238 }
2239}
2240
2241/// Return true if Set1 contains elements in Set2. The elements in common
2242/// are returned in a different container.
2243static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2245 Result.clear();
2246 for (SUnit *SU : Set1) {
2247 if (Set2.count(SU) != 0)
2248 Result.insert(SU);
2249 }
2250 return !Result.empty();
2251}
2252
2253/// Merge the recurrence node sets that have the same initial node.
2254void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2255 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2256 ++I) {
2257 NodeSet &NI = *I;
2258 for (NodeSetType::iterator J = I + 1; J != E;) {
2259 NodeSet &NJ = *J;
2260 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2261 if (NJ.compareRecMII(NI) > 0)
2262 NI.setRecMII(NJ.getRecMII());
2263 for (SUnit *SU : *J)
2264 I->insert(SU);
2265 NodeSets.erase(J);
2266 E = NodeSets.end();
2267 } else {
2268 ++J;
2269 }
2270 }
2271 }
2272}
2273
2274/// Remove nodes that have been scheduled in previous NodeSets.
2275void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2276 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2277 ++I)
2278 for (NodeSetType::iterator J = I + 1; J != E;) {
2279 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2280
2281 if (J->empty()) {
2282 NodeSets.erase(J);
2283 E = NodeSets.end();
2284 } else {
2285 ++J;
2286 }
2287 }
2288}
2289
2290/// Compute an ordered list of the dependence graph nodes, which
2291/// indicates the order that the nodes will be scheduled. This is a
2292/// two-level algorithm. First, a partial order is created, which
2293/// consists of a list of sets ordered from highest to lowest priority.
2294void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2296 NodeOrder.clear();
2297
2298 for (auto &Nodes : NodeSets) {
2299 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2300 OrderKind Order;
2302 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2303 R.insert(N.begin(), N.end());
2304 Order = BottomUp;
2305 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2306 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2307 R.insert(N.begin(), N.end());
2308 Order = TopDown;
2309 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2310 } else if (isIntersect(N, Nodes, R)) {
2311 // If some of the successors are in the existing node-set, then use the
2312 // top-down ordering.
2313 Order = TopDown;
2314 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2315 } else if (NodeSets.size() == 1) {
2316 for (const auto &N : Nodes)
2317 if (N->Succs.size() == 0)
2318 R.insert(N);
2319 Order = BottomUp;
2320 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2321 } else {
2322 // Find the node with the highest ASAP.
2323 SUnit *maxASAP = nullptr;
2324 for (SUnit *SU : Nodes) {
2325 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2326 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2327 maxASAP = SU;
2328 }
2329 R.insert(maxASAP);
2330 Order = BottomUp;
2331 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2332 }
2333
2334 while (!R.empty()) {
2335 if (Order == TopDown) {
2336 // Choose the node with the maximum height. If more than one, choose
2337 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2338 // choose the node with the lowest MOV.
2339 while (!R.empty()) {
2340 SUnit *maxHeight = nullptr;
2341 for (SUnit *I : R) {
2342 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2343 maxHeight = I;
2344 else if (getHeight(I) == getHeight(maxHeight) &&
2346 maxHeight = I;
2347 else if (getHeight(I) == getHeight(maxHeight) &&
2349 getZeroLatencyHeight(maxHeight) &&
2350 getMOV(I) < getMOV(maxHeight))
2351 maxHeight = I;
2352 }
2353 NodeOrder.insert(maxHeight);
2354 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2355 R.remove(maxHeight);
2356 for (const auto &I : maxHeight->Succs) {
2357 if (Nodes.count(I.getSUnit()) == 0)
2358 continue;
2359 if (NodeOrder.contains(I.getSUnit()))
2360 continue;
2361 if (ignoreDependence(I, false))
2362 continue;
2363 R.insert(I.getSUnit());
2364 }
2365 // Back-edges are predecessors with an anti-dependence.
2366 for (const auto &I : maxHeight->Preds) {
2367 if (I.getKind() != SDep::Anti)
2368 continue;
2369 if (Nodes.count(I.getSUnit()) == 0)
2370 continue;
2371 if (NodeOrder.contains(I.getSUnit()))
2372 continue;
2373 R.insert(I.getSUnit());
2374 }
2375 }
2376 Order = BottomUp;
2377 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2379 if (pred_L(NodeOrder, N, &Nodes))
2380 R.insert(N.begin(), N.end());
2381 } else {
2382 // Choose the node with the maximum depth. If more than one, choose
2383 // the node with the maximum ZeroLatencyDepth. If still more than one,
2384 // choose the node with the lowest MOV.
2385 while (!R.empty()) {
2386 SUnit *maxDepth = nullptr;
2387 for (SUnit *I : R) {
2388 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2389 maxDepth = I;
2390 else if (getDepth(I) == getDepth(maxDepth) &&
2392 maxDepth = I;
2393 else if (getDepth(I) == getDepth(maxDepth) &&
2395 getMOV(I) < getMOV(maxDepth))
2396 maxDepth = I;
2397 }
2398 NodeOrder.insert(maxDepth);
2399 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2400 R.remove(maxDepth);
2401 if (Nodes.isExceedSU(maxDepth)) {
2402 Order = TopDown;
2403 R.clear();
2404 R.insert(Nodes.getNode(0));
2405 break;
2406 }
2407 for (const auto &I : maxDepth->Preds) {
2408 if (Nodes.count(I.getSUnit()) == 0)
2409 continue;
2410 if (NodeOrder.contains(I.getSUnit()))
2411 continue;
2412 R.insert(I.getSUnit());
2413 }
2414 // Back-edges are predecessors with an anti-dependence.
2415 for (const auto &I : maxDepth->Succs) {
2416 if (I.getKind() != SDep::Anti)
2417 continue;
2418 if (Nodes.count(I.getSUnit()) == 0)
2419 continue;
2420 if (NodeOrder.contains(I.getSUnit()))
2421 continue;
2422 R.insert(I.getSUnit());
2423 }
2424 }
2425 Order = TopDown;
2426 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2428 if (succ_L(NodeOrder, N, &Nodes))
2429 R.insert(N.begin(), N.end());
2430 }
2431 }
2432 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2433 }
2434
2435 LLVM_DEBUG({
2436 dbgs() << "Node order: ";
2437 for (SUnit *I : NodeOrder)
2438 dbgs() << " " << I->NodeNum << " ";
2439 dbgs() << "\n";
2440 });
2441}
2442
2443/// Process the nodes in the computed order and create the pipelined schedule
2444/// of the instructions, if possible. Return true if a schedule is found.
2445bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2446
2447 if (NodeOrder.empty()){
2448 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2449 return false;
2450 }
2451
2452 bool scheduleFound = false;
2453 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2454 if (LimitRegPressure) {
2455 HRPDetector =
2456 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2457 HRPDetector->init(RegClassInfo);
2458 }
2459 // Keep increasing II until a valid schedule is found.
2460 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2461 Schedule.reset();
2462 Schedule.setInitiationInterval(II);
2463 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2464
2465 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2466 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2467 do {
2468 SUnit *SU = *NI;
2469
2470 // Compute the schedule time for the instruction, which is based
2471 // upon the scheduled time for any predecessors/successors.
2472 int EarlyStart = INT_MIN;
2473 int LateStart = INT_MAX;
2474 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2475 LLVM_DEBUG({
2476 dbgs() << "\n";
2477 dbgs() << "Inst (" << SU->NodeNum << ") ";
2478 SU->getInstr()->dump();
2479 dbgs() << "\n";
2480 });
2481 LLVM_DEBUG(
2482 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2483
2484 if (EarlyStart > LateStart)
2485 scheduleFound = false;
2486 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2487 scheduleFound =
2488 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2489 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2490 scheduleFound =
2491 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2492 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2493 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2494 // When scheduling a Phi it is better to start at the late cycle and
2495 // go backwards. The default order may insert the Phi too far away
2496 // from its first dependence.
2497 // Also, do backward search when all scheduled predecessors are
2498 // loop-carried output/order dependencies. Empirically, there are also
2499 // cases where scheduling becomes possible with backward search.
2500 if (SU->getInstr()->isPHI() ||
2501 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this))
2502 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2503 else
2504 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2505 } else {
2506 int FirstCycle = Schedule.getFirstCycle();
2507 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2508 FirstCycle + getASAP(SU) + II - 1, II);
2509 }
2510
2511 // Even if we find a schedule, make sure the schedule doesn't exceed the
2512 // allowable number of stages. We keep trying if this happens.
2513 if (scheduleFound)
2514 if (SwpMaxStages > -1 &&
2515 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2516 scheduleFound = false;
2517
2518 LLVM_DEBUG({
2519 if (!scheduleFound)
2520 dbgs() << "\tCan't schedule\n";
2521 });
2522 } while (++NI != NE && scheduleFound);
2523
2524 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2525 if (scheduleFound)
2526 scheduleFound =
2527 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2528
2529 // If a schedule is found, check if it is a valid schedule too.
2530 if (scheduleFound)
2531 scheduleFound = Schedule.isValidSchedule(this);
2532
2533 // If a schedule was found and the option is enabled, check if the schedule
2534 // might generate additional register spills/fills.
2535 if (scheduleFound && LimitRegPressure)
2536 scheduleFound =
2537 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2538 }
2539
2540 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2541 << " (II=" << Schedule.getInitiationInterval()
2542 << ")\n");
2543
2544 if (scheduleFound) {
2545 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2546 if (!scheduleFound)
2547 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2548 }
2549
2550 if (scheduleFound) {
2551 Schedule.finalizeSchedule(this);
2552 Pass.ORE->emit([&]() {
2554 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2555 << "Schedule found with Initiation Interval: "
2556 << ore::NV("II", Schedule.getInitiationInterval())
2557 << ", MaxStageCount: "
2558 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2559 });
2560 } else
2561 Schedule.reset();
2562
2563 return scheduleFound && Schedule.getMaxStageCount() > 0;
2564}
2565
2566/// Return true if we can compute the amount the instruction changes
2567/// during each iteration. Set Delta to the amount of the change.
2568bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2570 const MachineOperand *BaseOp;
2571 int64_t Offset;
2572 bool OffsetIsScalable;
2573 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2574 return false;
2575
2576 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2577 if (OffsetIsScalable)
2578 return false;
2579
2580 if (!BaseOp->isReg())
2581 return false;
2582
2583 Register BaseReg = BaseOp->getReg();
2584
2586 // Check if there is a Phi. If so, get the definition in the loop.
2587 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2588 if (BaseDef && BaseDef->isPHI()) {
2589 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2590 BaseDef = MRI.getVRegDef(BaseReg);
2591 }
2592 if (!BaseDef)
2593 return false;
2594
2595 int D = 0;
2596 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2597 return false;
2598
2599 Delta = D;
2600 return true;
2601}
2602
2603/// Check if we can change the instruction to use an offset value from the
2604/// previous iteration. If so, return true and set the base and offset values
2605/// so that we can rewrite the load, if necessary.
2606/// v1 = Phi(v0, v3)
2607/// v2 = load v1, 0
2608/// v3 = post_store v1, 4, x
2609/// This function enables the load to be rewritten as v2 = load v3, 4.
2610bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2611 unsigned &BasePos,
2612 unsigned &OffsetPos,
2613 unsigned &NewBase,
2614 int64_t &Offset) {
2615 // Get the load instruction.
2616 if (TII->isPostIncrement(*MI))
2617 return false;
2618 unsigned BasePosLd, OffsetPosLd;
2619 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2620 return false;
2621 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2622
2623 // Look for the Phi instruction.
2624 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2625 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2626 if (!Phi || !Phi->isPHI())
2627 return false;
2628 // Get the register defined in the loop block.
2629 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2630 if (!PrevReg)
2631 return false;
2632
2633 // Check for the post-increment load/store instruction.
2634 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2635 if (!PrevDef || PrevDef == MI)
2636 return false;
2637
2638 if (!TII->isPostIncrement(*PrevDef))
2639 return false;
2640
2641 unsigned BasePos1 = 0, OffsetPos1 = 0;
2642 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2643 return false;
2644
2645 // Make sure that the instructions do not access the same memory location in
2646 // the next iteration.
2647 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2648 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2650 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2651 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2652 MF.deleteMachineInstr(NewMI);
2653 if (!Disjoint)
2654 return false;
2655
2656 // Set the return value once we determine that we return true.
2657 BasePos = BasePosLd;
2658 OffsetPos = OffsetPosLd;
2659 NewBase = PrevReg;
2660 Offset = StoreOffset;
2661 return true;
2662}
2663
2664/// Apply changes to the instruction if needed. The changes are need
2665/// to improve the scheduling and depend up on the final schedule.
2667 SMSchedule &Schedule) {
2668 SUnit *SU = getSUnit(MI);
2670 InstrChanges.find(SU);
2671 if (It != InstrChanges.end()) {
2672 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2673 unsigned BasePos, OffsetPos;
2674 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2675 return;
2676 Register BaseReg = MI->getOperand(BasePos).getReg();
2677 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2678 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2679 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2680 int BaseStageNum = Schedule.stageScheduled(SU);
2681 int BaseCycleNum = Schedule.cycleScheduled(SU);
2682 if (BaseStageNum < DefStageNum) {
2684 int OffsetDiff = DefStageNum - BaseStageNum;
2685 if (DefCycleNum < BaseCycleNum) {
2686 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2687 if (OffsetDiff > 0)
2688 --OffsetDiff;
2689 }
2690 int64_t NewOffset =
2691 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2692 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2693 SU->setInstr(NewMI);
2694 MISUnitMap[NewMI] = SU;
2695 NewMIs[MI] = NewMI;
2696 }
2697 }
2698}
2699
2700/// Return the instruction in the loop that defines the register.
2701/// If the definition is a Phi, then follow the Phi operand to
2702/// the instruction in the loop.
2703MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2705 MachineInstr *Def = MRI.getVRegDef(Reg);
2706 while (Def->isPHI()) {
2707 if (!Visited.insert(Def).second)
2708 break;
2709 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2710 if (Def->getOperand(i + 1).getMBB() == BB) {
2711 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2712 break;
2713 }
2714 }
2715 return Def;
2716}
2717
2718/// Return true for an order or output dependence that is loop carried
2719/// potentially. A dependence is loop carried if the destination defines a value
2720/// that may be used or defined by the source in a subsequent iteration.
2722 bool isSucc) {
2723 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2724 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2725 return false;
2726
2728 return true;
2729
2730 if (Dep.getKind() == SDep::Output)
2731 return true;
2732
2733 MachineInstr *SI = Source->getInstr();
2734 MachineInstr *DI = Dep.getSUnit()->getInstr();
2735 if (!isSucc)
2736 std::swap(SI, DI);
2737 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2738
2739 // Assume ordered loads and stores may have a loop carried dependence.
2740 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2741 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2742 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2743 return true;
2744
2745 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2746 return false;
2747
2748 // The conservative assumption is that a dependence between memory operations
2749 // may be loop carried. The following code checks when it can be proved that
2750 // there is no loop carried dependence.
2751 unsigned DeltaS, DeltaD;
2752 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2753 return true;
2754
2755 const MachineOperand *BaseOpS, *BaseOpD;
2756 int64_t OffsetS, OffsetD;
2757 bool OffsetSIsScalable, OffsetDIsScalable;
2759 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2760 TRI) ||
2761 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2762 TRI))
2763 return true;
2764
2765 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2766 "Expected offsets to be byte offsets");
2767
2768 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2769 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2770 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2771 return true;
2772
2773 unsigned InitValS = 0;
2774 unsigned LoopValS = 0;
2775 unsigned InitValD = 0;
2776 unsigned LoopValD = 0;
2777 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2778 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2779 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2780 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2781
2782 if (!InitDefS->isIdenticalTo(*InitDefD))
2783 return true;
2784
2785 // Check that the base register is incremented by a constant value for each
2786 // iteration.
2787 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2788 int D = 0;
2789 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2790 return true;
2791
2792 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2793 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2794
2795 // This is the main test, which checks the offset values and the loop
2796 // increment value to determine if the accesses may be loop carried.
2797 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2798 return true;
2799
2800 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2801 DeltaD < AccessSizeD.getValue())
2802 return true;
2803
2804 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2805 OffsetD + (int64_t)AccessSizeD.getValue());
2806}
2807
2808void SwingSchedulerDAG::postProcessDAG() {
2809 for (auto &M : Mutations)
2810 M->apply(this);
2811}
2812
2813/// Try to schedule the node at the specified StartCycle and continue
2814/// until the node is schedule or the EndCycle is reached. This function
2815/// returns true if the node is scheduled. This routine may search either
2816/// forward or backward for a place to insert the instruction based upon
2817/// the relative values of StartCycle and EndCycle.
2818bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2819 bool forward = true;
2820 LLVM_DEBUG({
2821 dbgs() << "Trying to insert node between " << StartCycle << " and "
2822 << EndCycle << " II: " << II << "\n";
2823 });
2824 if (StartCycle > EndCycle)
2825 forward = false;
2826
2827 // The terminating condition depends on the direction.
2828 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2829 for (int curCycle = StartCycle; curCycle != termCycle;
2830 forward ? ++curCycle : --curCycle) {
2831
2832 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2833 ProcItinResources.canReserveResources(*SU, curCycle)) {
2834 LLVM_DEBUG({
2835 dbgs() << "\tinsert at cycle " << curCycle << " ";
2836 SU->getInstr()->dump();
2837 });
2838
2839 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2840 ProcItinResources.reserveResources(*SU, curCycle);
2841 ScheduledInstrs[curCycle].push_back(SU);
2842 InstrToCycle.insert(std::make_pair(SU, curCycle));
2843 if (curCycle > LastCycle)
2844 LastCycle = curCycle;
2845 if (curCycle < FirstCycle)
2846 FirstCycle = curCycle;
2847 return true;
2848 }
2849 LLVM_DEBUG({
2850 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2851 SU->getInstr()->dump();
2852 });
2853 }
2854 return false;
2855}
2856
2857// Return the cycle of the earliest scheduled instruction in the chain.
2860 SmallVector<SDep, 8> Worklist;
2861 Worklist.push_back(Dep);
2862 int EarlyCycle = INT_MAX;
2863 while (!Worklist.empty()) {
2864 const SDep &Cur = Worklist.pop_back_val();
2865 SUnit *PrevSU = Cur.getSUnit();
2866 if (Visited.count(PrevSU))
2867 continue;
2868 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2869 if (it == InstrToCycle.end())
2870 continue;
2871 EarlyCycle = std::min(EarlyCycle, it->second);
2872 for (const auto &PI : PrevSU->Preds)
2873 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2874 Worklist.push_back(PI);
2875 Visited.insert(PrevSU);
2876 }
2877 return EarlyCycle;
2878}
2879
2880// Return the cycle of the latest scheduled instruction in the chain.
2883 SmallVector<SDep, 8> Worklist;
2884 Worklist.push_back(Dep);
2885 int LateCycle = INT_MIN;
2886 while (!Worklist.empty()) {
2887 const SDep &Cur = Worklist.pop_back_val();
2888 SUnit *SuccSU = Cur.getSUnit();
2889 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2890 continue;
2891 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2892 if (it == InstrToCycle.end())
2893 continue;
2894 LateCycle = std::max(LateCycle, it->second);
2895 for (const auto &SI : SuccSU->Succs)
2896 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2897 Worklist.push_back(SI);
2898 Visited.insert(SuccSU);
2899 }
2900 return LateCycle;
2901}
2902
2903/// If an instruction has a use that spans multiple iterations, then
2904/// return true. These instructions are characterized by having a back-ege
2905/// to a Phi, which contains a reference to another Phi.
2907 for (auto &P : SU->Preds)
2908 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2909 for (auto &S : P.getSUnit()->Succs)
2910 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2911 return P.getSUnit();
2912 return nullptr;
2913}
2914
2915/// Compute the scheduling start slot for the instruction. The start slot
2916/// depends on any predecessor or successor nodes scheduled already.
2917void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2918 int II, SwingSchedulerDAG *DAG) {
2919 // Iterate over each instruction that has been scheduled already. The start
2920 // slot computation depends on whether the previously scheduled instruction
2921 // is a predecessor or successor of the specified instruction.
2922 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2923
2924 // Iterate over each instruction in the current cycle.
2925 for (SUnit *I : getInstructions(cycle)) {
2926 // Because we're processing a DAG for the dependences, we recognize
2927 // the back-edge in recurrences by anti dependences.
2928 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2929 const SDep &Dep = SU->Preds[i];
2930 if (Dep.getSUnit() == I) {
2931 if (!DAG->isBackedge(SU, Dep)) {
2932 int EarlyStart = cycle + Dep.getLatency() -
2933 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2934 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2935 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2936 int End = earliestCycleInChain(Dep) + (II - 1);
2937 *MinLateStart = std::min(*MinLateStart, End);
2938 }
2939 } else {
2940 int LateStart = cycle - Dep.getLatency() +
2941 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2942 *MinLateStart = std::min(*MinLateStart, LateStart);
2943 }
2944 }
2945 // For instruction that requires multiple iterations, make sure that
2946 // the dependent instruction is not scheduled past the definition.
2947 SUnit *BE = multipleIterations(I, DAG);
2948 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2949 !SU->isPred(I))
2950 *MinLateStart = std::min(*MinLateStart, cycle);
2951 }
2952 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2953 if (SU->Succs[i].getSUnit() == I) {
2954 const SDep &Dep = SU->Succs[i];
2955 if (!DAG->isBackedge(SU, Dep)) {
2956 int LateStart = cycle - Dep.getLatency() +
2957 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2958 *MinLateStart = std::min(*MinLateStart, LateStart);
2959 if (DAG->isLoopCarriedDep(SU, Dep)) {
2960 int Start = latestCycleInChain(Dep) + 1 - II;
2961 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
2962 }
2963 } else {
2964 int EarlyStart = cycle + Dep.getLatency() -
2965 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2966 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2967 }
2968 }
2969 }
2970 }
2971 }
2972}
2973
2974/// Order the instructions within a cycle so that the definitions occur
2975/// before the uses. Returns true if the instruction is added to the start
2976/// of the list, or false if added to the end.
2978 std::deque<SUnit *> &Insts) const {
2979 MachineInstr *MI = SU->getInstr();
2980 bool OrderBeforeUse = false;
2981 bool OrderAfterDef = false;
2982 bool OrderBeforeDef = false;
2983 unsigned MoveDef = 0;
2984 unsigned MoveUse = 0;
2985 int StageInst1 = stageScheduled(SU);
2986
2987 unsigned Pos = 0;
2988 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2989 ++I, ++Pos) {
2990 for (MachineOperand &MO : MI->operands()) {
2991 if (!MO.isReg() || !MO.getReg().isVirtual())
2992 continue;
2993
2994 Register Reg = MO.getReg();
2995 unsigned BasePos, OffsetPos;
2996 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2997 if (MI->getOperand(BasePos).getReg() == Reg)
2998 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2999 Reg = NewReg;
3000 bool Reads, Writes;
3001 std::tie(Reads, Writes) =
3002 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3003 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3004 OrderBeforeUse = true;
3005 if (MoveUse == 0)
3006 MoveUse = Pos;
3007 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3008 // Add the instruction after the scheduled instruction.
3009 OrderAfterDef = true;
3010 MoveDef = Pos;
3011 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3012 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3013 OrderBeforeUse = true;
3014 if (MoveUse == 0)
3015 MoveUse = Pos;
3016 } else {
3017 OrderAfterDef = true;
3018 MoveDef = Pos;
3019 }
3020 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3021 OrderBeforeUse = true;
3022 if (MoveUse == 0)
3023 MoveUse = Pos;
3024 if (MoveUse != 0) {
3025 OrderAfterDef = true;
3026 MoveDef = Pos - 1;
3027 }
3028 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3029 // Add the instruction before the scheduled instruction.
3030 OrderBeforeUse = true;
3031 if (MoveUse == 0)
3032 MoveUse = Pos;
3033 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3034 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3035 if (MoveUse == 0) {
3036 OrderBeforeDef = true;
3037 MoveUse = Pos;
3038 }
3039 }
3040 }
3041 // Check for order dependences between instructions. Make sure the source
3042 // is ordered before the destination.
3043 for (auto &S : SU->Succs) {
3044 if (S.getSUnit() != *I)
3045 continue;
3046 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3047 OrderBeforeUse = true;
3048 if (Pos < MoveUse)
3049 MoveUse = Pos;
3050 }
3051 // We did not handle HW dependences in previous for loop,
3052 // and we normally set Latency = 0 for Anti/Output deps,
3053 // so may have nodes in same cycle with Anti/Output dependent on HW regs.
3054 else if ((S.getKind() == SDep::Anti || S.getKind() == SDep::Output) &&
3055 stageScheduled(*I) == StageInst1) {
3056 OrderBeforeUse = true;
3057 if ((MoveUse == 0) || (Pos < MoveUse))
3058 MoveUse = Pos;
3059 }
3060 }
3061 for (auto &P : SU->Preds) {
3062 if (P.getSUnit() != *I)
3063 continue;
3064 if ((P.getKind() == SDep::Order || P.getKind() == SDep::Anti ||
3065 P.getKind() == SDep::Output) &&
3066 stageScheduled(*I) == StageInst1) {
3067 OrderAfterDef = true;
3068 MoveDef = Pos;
3069 }
3070 }
3071 }
3072
3073 // A circular dependence.
3074 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3075 OrderBeforeUse = false;
3076
3077 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3078 // to a loop-carried dependence.
3079 if (OrderBeforeDef)
3080 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3081
3082 // The uncommon case when the instruction order needs to be updated because
3083 // there is both a use and def.
3084 if (OrderBeforeUse && OrderAfterDef) {
3085 SUnit *UseSU = Insts.at(MoveUse);
3086 SUnit *DefSU = Insts.at(MoveDef);
3087 if (MoveUse > MoveDef) {
3088 Insts.erase(Insts.begin() + MoveUse);
3089 Insts.erase(Insts.begin() + MoveDef);
3090 } else {
3091 Insts.erase(Insts.begin() + MoveDef);
3092 Insts.erase(Insts.begin() + MoveUse);
3093 }
3094 orderDependence(SSD, UseSU, Insts);
3095 orderDependence(SSD, SU, Insts);
3096 orderDependence(SSD, DefSU, Insts);
3097 return;
3098 }
3099 // Put the new instruction first if there is a use in the list. Otherwise,
3100 // put it at the end of the list.
3101 if (OrderBeforeUse)
3102 Insts.push_front(SU);
3103 else
3104 Insts.push_back(SU);
3105}
3106
3107/// Return true if the scheduled Phi has a loop carried operand.
3109 MachineInstr &Phi) const {
3110 if (!Phi.isPHI())
3111 return false;
3112 assert(Phi.isPHI() && "Expecting a Phi.");
3113 SUnit *DefSU = SSD->getSUnit(&Phi);
3114 unsigned DefCycle = cycleScheduled(DefSU);
3115 int DefStage = stageScheduled(DefSU);
3116
3117 unsigned InitVal = 0;
3118 unsigned LoopVal = 0;
3119 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3120 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3121 if (!UseSU)
3122 return true;
3123 if (UseSU->getInstr()->isPHI())
3124 return true;
3125 unsigned LoopCycle = cycleScheduled(UseSU);
3126 int LoopStage = stageScheduled(UseSU);
3127 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3128}
3129
3130/// Return true if the instruction is a definition that is loop carried
3131/// and defines the use on the next iteration.
3132/// v1 = phi(v2, v3)
3133/// (Def) v3 = op v1
3134/// (MO) = v1
3135/// If MO appears before Def, then v1 and v3 may get assigned to the same
3136/// register.
3138 MachineInstr *Def,
3139 MachineOperand &MO) const {
3140 if (!MO.isReg())
3141 return false;
3142 if (Def->isPHI())
3143 return false;
3144 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3145 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3146 return false;
3147 if (!isLoopCarried(SSD, *Phi))
3148 return false;
3149 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3150 for (MachineOperand &DMO : Def->all_defs()) {
3151 if (DMO.getReg() == LoopReg)
3152 return true;
3153 }
3154 return false;
3155}
3156
3157/// Return true if all scheduled predecessors are loop-carried output/order
3158/// dependencies.
3160 SUnit *SU, SwingSchedulerDAG *DAG) const {
3161 for (const SDep &Pred : SU->Preds)
3162 if (InstrToCycle.count(Pred.getSUnit()) && !DAG->isBackedge(SU, Pred))
3163 return false;
3164 for (const SDep &Succ : SU->Succs)
3165 if (InstrToCycle.count(Succ.getSUnit()) && DAG->isBackedge(SU, Succ))
3166 return false;
3167 return true;
3168}
3169
3170/// Determine transitive dependences of unpipelineable instructions
3173 SmallSet<SUnit *, 8> DoNotPipeline;
3174 SmallVector<SUnit *, 8> Worklist;
3175
3176 for (auto &SU : SSD->SUnits)
3177 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3178 Worklist.push_back(&SU);
3179
3180 while (!Worklist.empty()) {
3181 auto SU = Worklist.pop_back_val();
3182 if (DoNotPipeline.count(SU))
3183 continue;
3184 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3185 DoNotPipeline.insert(SU);
3186 for (auto &Dep : SU->Preds)
3187 Worklist.push_back(Dep.getSUnit());
3188 if (SU->getInstr()->isPHI())
3189 for (auto &Dep : SU->Succs)
3190 if (Dep.getKind() == SDep::Anti)
3191 Worklist.push_back(Dep.getSUnit());
3192 }
3193 return DoNotPipeline;
3194}
3195
3196// Determine all instructions upon which any unpipelineable instruction depends
3197// and ensure that they are in stage 0. If unable to do so, return false.
3201
3202 int NewLastCycle = INT_MIN;
3203 for (SUnit &SU : SSD->SUnits) {
3204 if (!SU.isInstr())
3205 continue;
3206 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3207 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3208 continue;
3209 }
3210
3211 // Put the non-pipelined instruction as early as possible in the schedule
3212 int NewCycle = getFirstCycle();
3213 for (auto &Dep : SU.Preds)
3214 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3215
3216 int OldCycle = InstrToCycle[&SU];
3217 if (OldCycle != NewCycle) {
3218 InstrToCycle[&SU] = NewCycle;
3219 auto &OldS = getInstructions(OldCycle);
3220 llvm::erase(OldS, &SU);
3221 getInstructions(NewCycle).emplace_back(&SU);
3222 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3223 << ") is not pipelined; moving from cycle " << OldCycle
3224 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3225 }
3226 NewLastCycle = std::max(NewLastCycle, NewCycle);
3227 }
3228 LastCycle = NewLastCycle;
3229 return true;
3230}
3231
3232// Check if the generated schedule is valid. This function checks if
3233// an instruction that uses a physical register is scheduled in a
3234// different stage than the definition. The pipeliner does not handle
3235// physical register values that may cross a basic block boundary.
3236// Furthermore, if a physical def/use pair is assigned to the same
3237// cycle, orderDependence does not guarantee def/use ordering, so that
3238// case should be considered invalid. (The test checks for both
3239// earlier and same-cycle use to be more robust.)
3241 for (SUnit &SU : SSD->SUnits) {
3242 if (!SU.hasPhysRegDefs)
3243 continue;
3244 int StageDef = stageScheduled(&SU);
3245 int CycleDef = InstrToCycle[&SU];
3246 assert(StageDef != -1 && "Instruction should have been scheduled.");
3247 for (auto &SI : SU.Succs)
3248 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3249 if (Register::isPhysicalRegister(SI.getReg())) {
3250 if (stageScheduled(SI.getSUnit()) != StageDef)
3251 return false;
3252 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3253 return false;
3254 }
3255 }
3256 return true;
3257}
3258
3259/// A property of the node order in swing-modulo-scheduling is
3260/// that for nodes outside circuits the following holds:
3261/// none of them is scheduled after both a successor and a
3262/// predecessor.
3263/// The method below checks whether the property is met.
3264/// If not, debug information is printed and statistics information updated.
3265/// Note that we do not use an assert statement.
3266/// The reason is that although an invalid node oder may prevent
3267/// the pipeliner from finding a pipelined schedule for arbitrary II,
3268/// it does not lead to the generation of incorrect code.
3269void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3270
3271 // a sorted vector that maps each SUnit to its index in the NodeOrder
3272 typedef std::pair<SUnit *, unsigned> UnitIndex;
3273 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3274
3275 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3276 Indices.push_back(std::make_pair(NodeOrder[i], i));
3277
3278 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3279 return std::get<0>(i1) < std::get<0>(i2);
3280 };
3281
3282 // sort, so that we can perform a binary search
3283 llvm::sort(Indices, CompareKey);
3284
3285 bool Valid = true;
3286 (void)Valid;
3287 // for each SUnit in the NodeOrder, check whether
3288 // it appears after both a successor and a predecessor
3289 // of the SUnit. If this is the case, and the SUnit
3290 // is not part of circuit, then the NodeOrder is not
3291 // valid.
3292 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3293 SUnit *SU = NodeOrder[i];
3294 unsigned Index = i;
3295
3296 bool PredBefore = false;
3297 bool SuccBefore = false;
3298
3299 SUnit *Succ;
3300 SUnit *Pred;
3301 (void)Succ;
3302 (void)Pred;
3303
3304 for (SDep &PredEdge : SU->Preds) {
3305 SUnit *PredSU = PredEdge.getSUnit();
3306 unsigned PredIndex = std::get<1>(
3307 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3308 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3309 PredBefore = true;
3310 Pred = PredSU;
3311 break;
3312 }
3313 }
3314
3315 for (SDep &SuccEdge : SU->Succs) {
3316 SUnit *SuccSU = SuccEdge.getSUnit();
3317 // Do not process a boundary node, it was not included in NodeOrder,
3318 // hence not in Indices either, call to std::lower_bound() below will
3319 // return Indices.end().
3320 if (SuccSU->isBoundaryNode())
3321 continue;
3322 unsigned SuccIndex = std::get<1>(
3323 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3324 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3325 SuccBefore = true;
3326 Succ = SuccSU;
3327 break;
3328 }
3329 }
3330
3331 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3332 // instructions in circuits are allowed to be scheduled
3333 // after both a successor and predecessor.
3334 bool InCircuit = llvm::any_of(
3335 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3336 if (InCircuit)
3337 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3338 else {
3339 Valid = false;
3340 NumNodeOrderIssues++;
3341 LLVM_DEBUG(dbgs() << "Predecessor ";);
3342 }
3343 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3344 << " are scheduled before node " << SU->NodeNum
3345 << "\n";);
3346 }
3347 }
3348
3349 LLVM_DEBUG({
3350 if (!Valid)
3351 dbgs() << "Invalid node order found!\n";
3352 });
3353}
3354
3355/// Attempt to fix the degenerate cases when the instruction serialization
3356/// causes the register lifetimes to overlap. For example,
3357/// p' = store_pi(p, b)
3358/// = load p, offset
3359/// In this case p and p' overlap, which means that two registers are needed.
3360/// Instead, this function changes the load to use p' and updates the offset.
3361void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3362 unsigned OverlapReg = 0;
3363 unsigned NewBaseReg = 0;
3364 for (SUnit *SU : Instrs) {
3365 MachineInstr *MI = SU->getInstr();
3366 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3367 const MachineOperand &MO = MI->getOperand(i);
3368 // Look for an instruction that uses p. The instruction occurs in the
3369 // same cycle but occurs later in the serialized order.
3370 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3371 // Check that the instruction appears in the InstrChanges structure,
3372 // which contains instructions that can have the offset updated.
3374 InstrChanges.find(SU);
3375 if (It != InstrChanges.end()) {
3376 unsigned BasePos, OffsetPos;
3377 // Update the base register and adjust the offset.
3378 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3380 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3381 int64_t NewOffset =
3382 MI->getOperand(OffsetPos).getImm() - It->second.second;
3383 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3384 SU->setInstr(NewMI);
3385 MISUnitMap[NewMI] = SU;
3386 NewMIs[MI] = NewMI;
3387 }
3388 }
3389 OverlapReg = 0;
3390 NewBaseReg = 0;
3391 break;
3392 }
3393 // Look for an instruction of the form p' = op(p), which uses and defines
3394 // two virtual registers that get allocated to the same physical register.
3395 unsigned TiedUseIdx = 0;
3396 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3397 // OverlapReg is p in the example above.
3398 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3399 // NewBaseReg is p' in the example above.
3400 NewBaseReg = MI->getOperand(i).getReg();
3401 break;
3402 }
3403 }
3404 }
3405}
3406
3407std::deque<SUnit *>
3409 const std::deque<SUnit *> &Instrs) const {
3410 std::deque<SUnit *> NewOrderPhi;
3411 for (SUnit *SU : Instrs) {
3412 if (SU->getInstr()->isPHI())
3413 NewOrderPhi.push_back(SU);
3414 }
3415 std::deque<SUnit *> NewOrderI;
3416 for (SUnit *SU : Instrs) {
3417 if (!SU->getInstr()->isPHI())
3418 orderDependence(SSD, SU, NewOrderI);
3419 }
3420 llvm::append_range(NewOrderPhi, NewOrderI);
3421 return NewOrderPhi;
3422}
3423
3424/// After the schedule has been formed, call this function to combine
3425/// the instructions from the different stages/cycles. That is, this
3426/// function creates a schedule that represents a single iteration.
3428 // Move all instructions to the first stage from later stages.
3429 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3430 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3431 ++stage) {
3432 std::deque<SUnit *> &cycleInstrs =
3433 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3434 for (SUnit *SU : llvm::reverse(cycleInstrs))
3435 ScheduledInstrs[cycle].push_front(SU);
3436 }
3437 }
3438
3439 // Erase all the elements in the later stages. Only one iteration should
3440 // remain in the scheduled list, and it contains all the instructions.
3441 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3442 ScheduledInstrs.erase(cycle);
3443
3444 // Change the registers in instruction as specified in the InstrChanges
3445 // map. We need to use the new registers to create the correct order.
3446 for (const SUnit &SU : SSD->SUnits)
3447 SSD->applyInstrChange(SU.getInstr(), *this);
3448
3449 // Reorder the instructions in each cycle to fix and improve the
3450 // generated code.
3451 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3452 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3453 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3454 SSD->fixupRegisterOverlaps(cycleInstrs);
3455 }
3456
3457 LLVM_DEBUG(dump(););
3458}
3459
3461 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3462 << " depth " << MaxDepth << " col " << Colocate << "\n";
3463 for (const auto &I : Nodes)
3464 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3465 os << "\n";
3466}
3467
3468#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3469/// Print the schedule information to the given output.
3471 // Iterate over each cycle.
3472 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3473 // Iterate over each instruction in the cycle.
3474 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3475 for (SUnit *CI : cycleInstrs->second) {
3476 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3477 os << "(" << CI->NodeNum << ") ";
3478 CI->getInstr()->print(os);
3479 os << "\n";
3480 }
3481 }
3482}
3483
3484/// Utility function used for debugging to print the schedule.
3487
3488void ResourceManager::dumpMRT() const {
3489 LLVM_DEBUG({
3490 if (UseDFA)
3491 return;
3492 std::stringstream SS;
3493 SS << "MRT:\n";
3494 SS << std::setw(4) << "Slot";
3495 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3496 SS << std::setw(3) << I;
3497 SS << std::setw(7) << "#Mops"
3498 << "\n";
3499 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3500 SS << std::setw(4) << Slot;
3501 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3502 SS << std::setw(3) << MRT[Slot][I];
3503 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3504 }
3505 dbgs() << SS.str();
3506 });
3507}
3508#endif
3509
3511 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3512 unsigned ProcResourceID = 0;
3513
3514 // We currently limit the resource kinds to 64 and below so that we can use
3515 // uint64_t for Masks
3516 assert(SM.getNumProcResourceKinds() < 64 &&
3517 "Too many kinds of resources, unsupported");
3518 // Create a unique bitmask for every processor resource unit.
3519 // Skip resource at index 0, since it always references 'InvalidUnit'.
3520 Masks.resize(SM.getNumProcResourceKinds());
3521 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3523 if (Desc.SubUnitsIdxBegin)
3524 continue;
3525 Masks[I] = 1ULL << ProcResourceID;
3526 ProcResourceID++;
3527 }
3528 // Create a unique bitmask for every processor resource group.
3529 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3531 if (!Desc.SubUnitsIdxBegin)
3532 continue;
3533 Masks[I] = 1ULL << ProcResourceID;
3534 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3535 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3536 ProcResourceID++;
3537 }
3538 LLVM_DEBUG({
3539 if (SwpShowResMask) {
3540 dbgs() << "ProcResourceDesc:\n";
3541 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3542 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3543 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3544 ProcResource->Name, I, Masks[I],
3545 ProcResource->NumUnits);
3546 }
3547 dbgs() << " -----------------\n";
3548 }
3549 });
3550}
3551
3553 LLVM_DEBUG({
3554 if (SwpDebugResource)
3555 dbgs() << "canReserveResources:\n";
3556 });
3557 if (UseDFA)
3558 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3559 ->canReserveResources(&SU.getInstr()->getDesc());
3560
3561 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3562 if (!SCDesc->isValid()) {
3563 LLVM_DEBUG({
3564 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3565 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3566 });
3567 return true;
3568 }
3569
3570 reserveResources(SCDesc, Cycle);
3571 bool Result = !isOverbooked();
3572 unreserveResources(SCDesc, Cycle);
3573
3574 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3575 return Result;
3576}
3577
3578void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3579 LLVM_DEBUG({
3580 if (SwpDebugResource)
3581 dbgs() << "reserveResources:\n";
3582 });
3583 if (UseDFA)
3584 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3585 ->reserveResources(&SU.getInstr()->getDesc());
3586
3587 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3588 if (!SCDesc->isValid()) {
3589 LLVM_DEBUG({
3590 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3591 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3592 });
3593 return;
3594 }
3595
3596 reserveResources(SCDesc, Cycle);
3597
3598 LLVM_DEBUG({
3599 if (SwpDebugResource) {
3600 dumpMRT();
3601 dbgs() << "reserveResources: done!\n\n";
3602 }
3603 });
3604}
3605
3606void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3607 int Cycle) {
3608 assert(!UseDFA);
3609 for (const MCWriteProcResEntry &PRE : make_range(
3610 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3611 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3612 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3613
3614 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3615 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3616}
3617
3618void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3619 int Cycle) {
3620 assert(!UseDFA);
3621 for (const MCWriteProcResEntry &PRE : make_range(
3622 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3623 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3624 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3625
3626 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3627 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3628}
3629
3630bool ResourceManager::isOverbooked() const {
3631 assert(!UseDFA);
3632 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3633 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3635 if (MRT[Slot][I] > Desc->NumUnits)
3636 return true;
3637 }
3638 if (NumScheduledMops[Slot] > IssueWidth)
3639 return true;
3640 }
3641 return false;
3642}
3643
3644int ResourceManager::calculateResMIIDFA() const {
3645 assert(UseDFA);
3646
3647 // Sort the instructions by the number of available choices for scheduling,
3648 // least to most. Use the number of critical resources as the tie breaker.
3649 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3650 for (SUnit &SU : DAG->SUnits)
3651 FUS.calcCriticalResources(*SU.getInstr());
3653 FuncUnitOrder(FUS);
3654
3655 for (SUnit &SU : DAG->SUnits)
3656 FuncUnitOrder.push(SU.getInstr());
3657
3659 Resources.push_back(
3660 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3661
3662 while (!FuncUnitOrder.empty()) {
3663 MachineInstr *MI = FuncUnitOrder.top();
3664 FuncUnitOrder.pop();
3665 if (TII->isZeroCost(MI->getOpcode()))
3666 continue;
3667
3668 // Attempt to reserve the instruction in an existing DFA. At least one
3669 // DFA is needed for each cycle.
3670 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3671 unsigned ReservedCycles = 0;
3672 auto *RI = Resources.begin();
3673 auto *RE = Resources.end();
3674 LLVM_DEBUG({
3675 dbgs() << "Trying to reserve resource for " << NumCycles
3676 << " cycles for \n";
3677 MI->dump();
3678 });
3679 for (unsigned C = 0; C < NumCycles; ++C)
3680 while (RI != RE) {
3681 if ((*RI)->canReserveResources(*MI)) {
3682 (*RI)->reserveResources(*MI);
3683 ++ReservedCycles;
3684 break;
3685 }
3686 RI++;
3687 }
3688 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3689 << ", NumCycles:" << NumCycles << "\n");
3690 // Add new DFAs, if needed, to reserve resources.
3691 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3693 << "NewResource created to reserve resources"
3694 << "\n");
3695 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3696 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3697 NewResource->reserveResources(*MI);
3698 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3699 }
3700 }
3701
3702 int Resmii = Resources.size();
3703 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3704 return Resmii;
3705}
3706
3708 if (UseDFA)
3709 return calculateResMIIDFA();
3710
3711 // Count each resource consumption and divide it by the number of units.
3712 // ResMII is the max value among them.
3713
3714 int NumMops = 0;
3716 for (SUnit &SU : DAG->SUnits) {
3717 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3718 continue;
3719
3720 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3721 if (!SCDesc->isValid())
3722 continue;
3723
3724 LLVM_DEBUG({
3725 if (SwpDebugResource) {
3726 DAG->dumpNode(SU);
3727 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3728 << " WriteProcRes: ";
3729 }
3730 });
3731 NumMops += SCDesc->NumMicroOps;
3732 for (const MCWriteProcResEntry &PRE :
3733 make_range(STI->getWriteProcResBegin(SCDesc),
3734 STI->getWriteProcResEnd(SCDesc))) {
3735 LLVM_DEBUG({
3736 if (SwpDebugResource) {
3737 const MCProcResourceDesc *Desc =
3738 SM.getProcResource(PRE.ProcResourceIdx);
3739 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3740 }
3741 });
3742 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3743 }
3744 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3745 }
3746
3747 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3748 LLVM_DEBUG({
3749 if (SwpDebugResource)
3750 dbgs() << "#Mops: " << NumMops << ", "
3751 << "IssueWidth: " << IssueWidth << ", "
3752 << "Cycles: " << Result << "\n";
3753 });
3754
3755 LLVM_DEBUG({
3756 if (SwpDebugResource) {
3757 std::stringstream SS;
3758 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3759 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3760 << "\n";
3761 dbgs() << SS.str();
3762 }
3763 });
3764 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3766 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3767 LLVM_DEBUG({
3768 if (SwpDebugResource) {
3769 std::stringstream SS;
3770 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3771 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3772 << std::setw(10) << Cycles << "\n";
3773 dbgs() << SS.str();
3774 }
3775 });
3776 if (Cycles > Result)
3777 Result = Cycles;
3778 }
3779 return Result;
3780}
3781
3783 InitiationInterval = II;
3784 DFAResources.clear();
3785 DFAResources.resize(II);
3786 for (auto &I : DFAResources)
3787 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3788 MRT.clear();
3790 NumScheduledMops.clear();
3791 NumScheduledMops.resize(II);
3792}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:336
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:891
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:616
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:566
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:930
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:756
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:498
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:504
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, SwingSchedulerDAG *DAG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:385
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:436
iterator end() const
Definition: SmallPtrSet.h:461
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
iterator begin() const
Definition: SmallPtrSet.h:456
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:503
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
void resize(size_type N)
Definition: SmallVector.h:652
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:826
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2065
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.